模組:headword/data
local data = {}
local u = mw.ustring.char
local rsubn = mw.ustring.gsub
local frame = mw.getCurrentFrame()
local title = mw.title.getCurrentTitle()
-- Version of rsubn() that discards all but the first return value.
local function rsub(term, foo, bar)
return (rsubn(term, foo, bar))
end
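-- Illustrative sketch (commented out, not part of the module's logic): the
-- extra parentheses discard gsub's second return value (the substitution
-- count), e.g.
--   rsubn("foo", "o", "0")  --> "f00", 2
--   rsub("foo", "o", "0")   --> "f00"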
local function track(track_id)
local tracking_page = "headword/" .. track_id
local m_debug_track = require("Module:debug/track")
m_debug_track(tracking_page)
return true
end
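-- Usage sketch (the call sites are assumed to be elsewhere in the headword
-- code; section 4 below contains two of them):
--   track("manual categories")
-- passes the id "headword/manual categories" to [[Module:debug/track]] and
-- always returns true, so it can be chained inside boolean expressions.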
------ 1. Lists that will be converted into sets. ------
data.invariable = {
"cmavo",
"cmene",
"fu'ivla",
"gismu",
"Han tu","漢字","汉字",
"hanzi","漢字","汉字",
"hanja","漢字","汉字",
"jyutping","粤语拼音","粵語拼音",
"kanji","漢字","汉字",
"lujvo",
"phrasebook",
"pinyin","拼音",
"rafsi",
"romaji","羅馬字","罗马字",
}
data.lemmas = {
"abbreviation","縮寫","缩写",
"acronym","首字母縮略詞","首字母缩略词",
"adjective","形容詞","形容词",
"adnominal","連體詞",
"adposition",
"adverb","副詞","副词",
"affix","綴詞","缀词",
"ambiposition",
"article","冠詞","冠词",
"chengyu","成語","成语",
"circumfix","環綴","环缀",
"circumposition",
"classifier","量詞","量词",
"cmavo",
"cmavo cluster",
"cmene",
"conjunction","連詞","连词",
"counter","量詞","量词",
"determiner","限定詞","限定词",
"diacritical mark","附加符號","附加符号",
"equative adjective",
"fu'ivla",
"gismu",
"Han character","漢字","汉字",
"Han tu","漢字","汉字",
"hanzi","漢字","汉字",
"hanja","漢字","汉字",
"ideophones","擬態詞",
"idiom","熟語","熟语","俗語","俗语",
"infix","中綴","中缀",
"interfix","間綴","间缀",
"initialism","首字母縮略詞","首字母缩略词",
"interjection","感嘆詞","感叹词","感歎詞",
"kanji","漢字","汉字",
"letter","字母",
"ligature","合字",
"lujvo",
"morpheme","詞素","词素",
"non-constituent",
"noun","名詞","名词",
"number","數字","数字",
"numeral symbol","數字符號","数字符号",
"numeral","數詞","数词",
"particle","助詞","助词",
"phrase","短語","短语",
"postposition","後置詞", "后置词",
"predicative", "表語", "表语",
"prefix","前綴","前缀",
"preposition","介詞","介词",
"prepositional phrase","介詞短語","介词短语",
"preverb",
"pronominal adverb",
"pronoun","代詞","代词",
"proverb","諺語","谚语",
"proper noun","專有名詞","专有名词",
"punctuation mark","標點符號", "标点符号",
"relative","關係詞",
"root","詞根","词根",
"stem","詞幹","词干",
"suffix","後綴","后缀",
"superlative adjective",
"superlative adverb",
"syllable","音節","音节",
"symbol","符號","符号",
"verb","動詞","动词",
}
data.nonlemmas = {
"active participle","主動分詞","主动分词",
"adjectival participle",
"adjective form","形容詞形式","形容词形式","形容詞變格形","形容词变格形",
"adjective comparative form","形容詞比較級變格形","形容词比较级变格形",
"adjective feminine form","形容詞陰性變格形","形容词阴性变格形",
"adjective equative form",
"adjective plural form","形容詞複數變格形","形容词复数变格形",
"adjective superlative form","形容詞最高級變格形","形容词最高级变格形",
"adverb form","副詞形式","副词形式","副詞變格形","副词变格形",
"adverb comparative form","副詞比較級變格形","副词比较级变格形",
"adverb superlative form","副詞最高級變格形","副词最高级变格形",
"adverbial participle",
"agent participle",
"article form","冠詞形式","冠词形式","冠詞變格形","冠词变格形",
"circumfix form",
"combined form",
"comparative adjective", "形容词比较级", "形容詞比較級",
"comparative adverb", "副词比较级", "副詞比較級",
"contraction","縮約形","缩约形",
"converb",
"determiner comparative form","限定詞比較級變格形","限定词比较级变格形",
"determiner form","限定詞形式","限定词形式","限定詞變格形","限定词变格形",
"determiner superlative form","限定詞最高級變格形","限定词最高级变格形",
"diminutive noun","名詞指小詞","名词指小词",
"future participle",
"gerund","動名詞","动名词",
"infinitive form","不定式",
"infinitive",
"interjection form","感嘆詞形式","感叹词形式","感嘆詞變格形","感叹词变格形",
"jyutping",
"kanji reading",
"misspelling","拼錯","拼错","拼寫錯誤","拼写错误",
"negative participle","否定分詞","否定分词",
"nominal participle",
"noun case form",
"noun dual form","名詞雙數形式",
"noun form","名詞形式","名词形式","名詞變格形","名词变格形",
"noun plural form","名詞複數形式","名词复数形式",
"noun possessive form","名詞所有格","名词所有格",
"noun singulative form",
"numeral form","數詞形式","数词形式","數詞變格形","数词变格形",
"participle","分詞","分词",
"participle form","分詞變格形","分词变格形",
"particle form","助詞形式","助词形式","助詞變格形","助词变格形",
"passive participle","被動分詞","被动分词",
"past active participle","過去主動分詞","过去主动分词",
"past participle","過去分詞","过去分词",
"past participle form","過去分詞變格形","过去分词变格形","過去分詞形式","过去分词形式",
"past passive participle","過去被動分詞","过去被动分词",
"perfect active participle","完成主動分詞","完成主动分词",
"perfect participle","完成分詞","完成分词",
"perfect passive participle","完成被動分詞","完成被动分词",
"pinyin","拼音",
"plural","複數","复数",
"postposition form","後置詞變格形","后置词变格形",
"prefix form","前綴變格形","前缀变格形",
"preposition contraction",
"preposition form","介詞形式","介词形式","介詞變格形","介词变格形",
"prepositional pronoun","介詞性代詞",
"present active participle","現在主動分詞","现在主动分词",
"present participle","現在分詞","现在分词",
"present passive participle","現在被動分詞","现在被动分词",
"pronoun form","代詞形式","代词形式","代詞變格形","代词变格形",
"pronoun possessive form",
"proper noun form","專有名詞形式","专有名词形式","專有名詞變格形","专有名词变格形",
"proper noun plural form","專有名詞複數形式","专有名词复数形式",
"rafsi",
"romanization","羅馬化","罗马化",
"romaji","羅馬字","罗马字",
"singulative",
"suffix form","後綴變格形","后缀变格形",
"verb form","動詞形式","动词形式","動詞變位形式","动词变位形式",
"verbal noun","動名詞","动名词",
}
-- These languages will not have links to the separate parts of the headword.
data.no_multiword_links = {
"zh",
}
-- These languages will not have "LANG multiword terms" categories added.
data.no_multiword_cat = {
-------- Languages without spaces between words (sometimes spaces between phrases) --------
"blt", -- Tai Dam
"ja", -- Japanese
"khb", -- Lü
"km", -- Khmer
"lo", -- Lao
"mnw", -- Mon
"my", -- Burmese
"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
"nod", -- Northern Thai
"ojp", -- Old Japanese
"shn", -- Shan
"sou", -- Southern Thai
"tdd", -- Tai Nüa
"th", -- Thai
"tts", -- Isan
"twh", -- Tai Dón
"txg", -- Tangut
"zh", -- Chinese (all varieties with Chinese characters)
"zkt", -- Khitan
-------- Languages with spaces between syllables --------
"ahk", -- Akha
"aou", -- A'ou
"atb", -- Zaiwa
"byk", -- Biao
"cdy", -- Chadong
--"duu", -- Drung; not sure
--"hmx-pro", -- Proto-Hmong-Mien
--"hnj", -- Green Hmong; not sure
"huq", -- Tsat
"ium", -- Iu Mien
--"lis", -- Lisu; not sure
"mtq", -- Muong
--"mww", -- White Hmong; not sure
"onb", -- Lingao
--"sit-gkh", -- Gokhy; not sure
--"swi", -- Sui; not sure
"tbq-lol-pro", -- Proto-Loloish
"tdh", -- Thulung
"ukk", -- Muak Sa-aak
"vi", -- Vietnamese
"yig", -- Wusa Nasu
"zng", -- Mang
-------- Languages with ~ with surrounding spaces used to separate variants --------
"mkh-ban-pro", -- Proto-Bahnaric
"sit-pro", -- Proto-Sino-Tibetan; listed above
-------- Other weirdnesses --------
"mul", -- Translingual; gestures, Morse code, etc.
"aot", -- Atong (India); bullet is a letter
-------- All sign languages --------
"ads",
"aed",
"aen",
"afg",
"ase",
"asf",
"asp",
"asq",
"asw",
"bfi",
"bfk",
"bog",
"bqn",
"bqy",
"bvl",
"bzs",
"cds",
"csc",
"csd",
"cse",
"csf",
"csg",
"csl",
"csn",
"csq",
"csr",
"doq",
"dse",
"dsl",
"ecs",
"esl",
"esn",
"eso",
"eth",
"fcs",
"fse",
"fsl",
"fss",
"gds",
"gse",
"gsg",
"gsm",
"gss",
"gus",
"hab",
"haf",
"hds",
"hks",
"hos",
"hps",
"hsh",
"hsl",
"icl",
"iks",
"ils",
"inl",
"ins",
"ise",
"isg",
"isr",
"jcs",
"jhs",
"jls",
"jos",
"jsl",
"jus",
"kgi",
"kvk",
"lbs",
"lls",
"lsl",
"lso",
"lsp",
"lst",
"lsy",
"lws",
"mdl",
"mfs",
"mre",
"msd",
"msr",
"mzc",
"mzg",
"mzy",
"nbs",
"ncs",
"nsi",
"nsl",
"nsp",
"nsr",
"nzs",
"okl",
"pgz",
"pks",
"prl",
"prz",
"psc",
"psd",
"psg",
"psl",
"pso",
"psp",
"psr",
"pys",
"rms",
"rsl",
"rsm",
"sdl",
"sfb",
"sfs",
"sgg",
"sgx",
"slf",
"sls",
"sqk",
"sqs",
"ssp",
"ssr",
"svk",
"swl",
"syy",
"tse",
"tsm",
"tsq",
"tss",
"tsy",
"tza",
"ugn",
"ugy",
"ukl",
"uks",
"vgt",
"vsi",
"vsl",
"vsv",
"xki",
"xml",
"xms",
"ygs",
"ysl",
"zib",
"zsl",
}
-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
data.hyphen_not_multiword_sep = {
"akk", -- Akkadian; hyphens between syllables
"akl", -- Aklanon; hyphens for mid-word glottal stops
"ber-pro", -- Proto-Berber; morphemes separated by hyphens
"ceb", -- Cebuano; hyphens for mid-word glottal stops
"cnk", -- Khumi Chin; hyphens used in single words
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"de", -- too many false positives
"esx-esk-pro", -- hyphen used to separate morphemes
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"ilo", -- Ilocano; hyphens for mid-word glottal stops
"lcp", -- Western Lawa; dash as syllable joiner
"lwl", -- Eastern Lawa; dash as syllable joiner
"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
"msb", -- Masbatenyo; too many false positives
"tl", -- Tagalog; too many false positives
"war", -- Waray-Waray; too many false positives
"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
}
-- These languages will not have "LANG masculine nouns" and similar categories added.
data.no_gender_cat = {
-- Languages without gender but which use the gender field for other purposes
"ja",
"th",
}
data.notranslit = {
"ams",
"az",
"bbc",
"bug",
"cia",
"cjm",
"cmn",
"hak",
"ja",
"kzg",
"lad",
"lzh",
"ms",
"mul",
"mvi",
"nan",
"oj",
"okn",
"ro",
"ryn",
"rys",
"ryu",
"sh",
"tgt",
"th",
"tkn",
"tly",
"txg",
"und",
"vi",
"xug",
"yoi",
"yox",
"yue",
"za",
"zh",
}
-- Script codes for which a script-tagged display title will be added.
data.toBeTagged = {
"Ahom",
"Arab",
"fa-Arab",
"glk-Arab",
"kk-Arab",
"ks-Arab",
"ku-Arab",
"mzn-Arab",
"ms-Arab",
"ota-Arab",
"pa-Arab",
"ps-Arab",
"sd-Arab",
"tt-Arab",
"ug-Arab",
"ur-Arab",
"Armi",
"Armn",
"Avst",
"Bali",
"Bamu",
"Batk",
"Beng",
"as-Beng",
"Bopo",
"Brah",
"Brai",
"Bugi",
"Buhd",
"Cakm",
"Cans",
"Cari",
"Cham",
"Cher",
"Copt",
"Cprt",
"Cyrl",
"Cyrs",
"Deva",
"Dsrt",
"Egyd",
"Egyp",
"Ethi",
"Geok",
"Geor",
"Glag",
"Goth",
"Grek",
"Polyt",
"Gujr",
"Guru",
"Hang",
"Hani",
"Hano",
"Hebr",
"Hira",
"Hluw",
"Ital",
"Java",
"Kali",
"Kana",
"Khar",
"Khmr",
"Knda",
"Kthi",
"Lana",
"Laoo",
"Latn",
"Latf",
"Latg",
"Latinx",
"nv-Latn",
"pjt-Latn",
"Lepc",
"Limb",
"Linb",
"Lisu",
"Lyci",
"Lydi",
"Mand",
"Mani",
"Merc",
"Mero",
"Mlym",
"Mong",
"mnc-Mong",
"sjo-Mong",
"xwo-Mong",
"Mtei",
"Mymr",
"Narb",
"Nkoo",
"Ogam",
"Olck",
"Orkh",
"Orya",
"Osma",
"Palm",
"Phag",
"Phli",
"Phlv",
"Phnx",
"Plrd",
"Prti",
"Rjng",
"Runr",
"Samr",
"Sarb",
"Saur",
"Sgnw",
"Shaw",
"Shrd",
"Sinh",
"Sora",
"Sund",
"Sylo",
"Syrc",
"Tagb",
"Tale",
"Talu",
"Taml",
"Tang",
"Tavt",
"Telu",
"Tfng",
"Tglg",
"Thaa",
"Thai",
"Tibt",
"xzh-Tibt",
"Ugar",
"Vaii",
"Xpeo",
"Xsux",
"Yiii",
"Zmth",
"Zsym",
"IPAchar",
"musical",
"Ruminumerals",
}
-- Parts of speech which will not be categorised in categories like "English terms spelled with É" if
-- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with
-- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter.
data.pos_not_spelled_with_self = {
"diacritical marks",
"Han characters",
"Han tu",
"hanja",
"hanzi",
"kanji",
"letters",
"ligatures",
"logograms",
"numeral symbols",
"numerals",
"symbols",
}
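-- Hypothetical caller sketch (the actual check presumably lives in the main
-- headword module): after the set conversion below, a caller can test e.g.
--   if data.pos_not_spelled_with_self["letters"] then
--       -- suppress the "terms spelled with ..." category for this entry
--   end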
-- Convert lists into sets.
for key, list in pairs(data) do
data[key] = {}
for _, item in ipairs(list) do
data[key][item] = true
end
end
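-- Illustrative sketch of the effect of this conversion (using data.lemmas as
-- an example): the array-style list
--   { "abbreviation", "縮寫", "缩写", ... }
-- becomes the set
--   { ["abbreviation"] = true, ["縮寫"] = true, ["缩写"] = true, ... }
-- so membership checks such as data.lemmas[pos_category] run in constant time.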
------ 2. Lists that will not be converted into sets. ------
-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
-- will be generated if the headword is of the appropriate gender/number.
data.pos_for_gender_number_cat = {
["名詞"] = "名詞",
["專有名詞"] = "名詞",
["後綴"] = "後綴",
-- We include verbs because impf and pf are valid "genders".
["動詞"] = "動詞",
}
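-- Usage sketch (hypothetical caller): the value is the canonical part of
-- speech used when building the gender/number category, e.g.
--   data.pos_for_gender_number_cat["專有名詞"]  --> "名詞"
-- so proper nouns are counted together with ordinary nouns.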
-- Combining character data used when categorising unusual characters. The two codepoint lists below are resolved into character classes, which are then combined into the match patterns further down: patterns for single combining characters (i.e. character + diacritic(s)) and for double combining characters (i.e. character + diacritic(s) + character).
local comb_chars = {
single = {
{0x0300, 0x034E},
-- Exclude combining grapheme joiner.
{0x0350, 0x035B},
{0x0363, 0x036F},
{0x0483, 0x0489},
{0x0591, 0x05BD},
{0x05BF},
{0x05C1, 0x05C2},
{0x05C4, 0x05C5},
{0x05C7},
{0x0610, 0x061A},
{0x064B, 0x065F},
{0x0670},
{0x06D6, 0x06DC},
{0x06DF, 0x06E4},
{0x06E7, 0x06E8},
{0x06EA, 0x06ED},
{0x0711},
{0x0730, 0x074A},
{0x07A6, 0x07B0},
{0x07EB, 0x07F3},
{0x07FD},
{0x0816, 0x0819},
{0x081B, 0x0823},
{0x0825, 0x0827},
{0x0829, 0x082D},
{0x0859, 0x085B},
{0x0898, 0x089F},
{0x08CA, 0x08E1},
{0x08E3, 0x0902},
{0x093A},
{0x093C},
{0x0941, 0x0948},
{0x094D},
{0x0951, 0x0957},
{0x0962, 0x0963},
{0x0981},
{0x09BC},
{0x09C1, 0x09C4},
{0x09CD},
{0x09E2, 0x09E3},
{0x09FE},
{0x0A01, 0x0A02},
{0x0A3C},
{0x0A41, 0x0A42},
{0x0A47, 0x0A48},
{0x0A4B, 0x0A4D},
{0x0A51},
{0x0A70, 0x0A71},
{0x0A75},
{0x0A81, 0x0A82},
{0x0ABC},
{0x0AC1, 0x0AC5},
{0x0AC7, 0x0AC8},
{0x0ACD},
{0x0AE2, 0x0AE3},
{0x0AFA, 0x0AFF},
{0x0B01},
{0x0B3C},
{0x0B3F},
{0x0B41, 0x0B44},
{0x0B4D},
{0x0B55, 0x0B56},
{0x0B62, 0x0B63},
{0x0B82},
{0x0BC0},
{0x0BCD},
{0x0C00},
{0x0C04},
{0x0C3C},
{0x0C3E, 0x0C40},
{0x0C46, 0x0C48},
{0x0C4A, 0x0C4D},
{0x0C55, 0x0C56},
{0x0C62, 0x0C63},
{0x0C81},
{0x0CBC},
{0x0CBF},
{0x0CC6},
{0x0CCC, 0x0CCD},
{0x0CE2, 0x0CE3},
{0x0D00, 0x0D01},
{0x0D3B, 0x0D3C},
{0x0D41, 0x0D44},
{0x0D4D},
{0x0D62, 0x0D63},
{0x0D81},
{0x0DCA},
{0x0DD2, 0x0DD4},
{0x0DD6},
{0x0E31},
{0x0E34, 0x0E3A},
{0x0E47, 0x0E4E},
{0x0EB1},
{0x0EB4, 0x0EBC},
{0x0EC8, 0x0ECE},
{0x0F18, 0x0F19},
{0x0F35},
{0x0F37},
{0x0F39},
{0x0F71, 0x0F7E},
{0x0F80, 0x0F84},
{0x0F86, 0x0F87},
{0x0F8D, 0x0F97},
{0x0F99, 0x0FBC},
{0x0FC6},
{0x102D, 0x1030},
{0x1032, 0x1037},
{0x1039, 0x103A},
{0x103D, 0x103E},
{0x1058, 0x1059},
{0x105E, 0x1060},
{0x1071, 0x1074},
{0x1082},
{0x1085, 0x1086},
{0x108D},
{0x109D},
{0x135D, 0x135F},
{0x1712, 0x1714},
{0x1732, 0x1733},
{0x1752, 0x1753},
{0x1772, 0x1773},
{0x17B4, 0x17B5},
{0x17B7, 0x17BD},
{0x17C6},
{0x17C9, 0x17D3},
{0x17DD},
-- Exclude Mongolian variation selectors.
{0x1885, 0x1886},
{0x18A9},
{0x1920, 0x1922},
{0x1927, 0x1928},
{0x1932},
{0x1939, 0x193B},
{0x1A17, 0x1A18},
{0x1A1B},
{0x1A56},
{0x1A58, 0x1A5E},
{0x1A60},
{0x1A62},
{0x1A65, 0x1A6C},
{0x1A73, 0x1A7C},
{0x1A7F},
{0x1AB0, 0x1ACE},
{0x1B00, 0x1B03},
{0x1B34},
{0x1B36, 0x1B3A},
{0x1B3C},
{0x1B42},
{0x1B6B, 0x1B73},
{0x1B80, 0x1B81},
{0x1BA2, 0x1BA5},
{0x1BA8, 0x1BA9},
{0x1BAB, 0x1BAD},
{0x1BE6},
{0x1BE8, 0x1BE9},
{0x1BED},
{0x1BEF, 0x1BF1},
{0x1C2C, 0x1C33},
{0x1C36, 0x1C37},
{0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE0},
{0x1CE2, 0x1CE8},
{0x1CED},
{0x1CF4},
{0x1CF8, 0x1CF9},
{0x1DC0, 0x1DCC},
{0x1DCE, 0x1DFB},
{0x1DFD, 0x1DFF},
{0x20D0, 0x20F0},
{0x2CEF, 0x2CF1},
{0x2D7F},
{0x2DE0, 0x2DFF},
{0x302A, 0x302D},
{0x3099, 0x309A},
{0xA66F, 0xA672},
{0xA674, 0xA67D},
{0xA69E, 0xA69F},
{0xA6F0, 0xA6F1},
{0xA802},
{0xA806},
{0xA80B},
{0xA825, 0xA826},
{0xA82C},
{0xA8C4, 0xA8C5},
{0xA8E0, 0xA8F1},
{0xA8FF},
{0xA926, 0xA92D},
{0xA947, 0xA951},
{0xA980, 0xA982},
{0xA9B3},
{0xA9B6, 0xA9B9},
{0xA9BC, 0xA9BD},
{0xA9E5},
{0xAA29, 0xAA2E},
{0xAA31, 0xAA32},
{0xAA35, 0xAA36},
{0xAA43},
{0xAA4C},
{0xAA7C},
{0xAAB0},
{0xAAB2, 0xAAB4},
{0xAAB7, 0xAAB8},
{0xAABE, 0xAABF},
{0xAAC1},
{0xAAEC, 0xAAED},
{0xAAF6},
{0xABE5},
{0xABE8},
{0xABED},
{0xFB1E},
{0xFE20, 0xFE2F},
{0x101FD},
{0x102E0},
{0x10376, 0x1037A},
{0x10A01, 0x10A03},
{0x10A05, 0x10A06},
{0x10A0C, 0x10A0F},
{0x10A38, 0x10A3A},
{0x10A3F},
{0x10AE5, 0x10AE6},
{0x10D24, 0x10D27},
{0x10EAB, 0x10EAC},
{0x10EFD, 0x10EFF},
{0x10F46, 0x10F50},
{0x10F82, 0x10F85},
{0x11001},
{0x11038, 0x11046},
{0x11070},
{0x11073, 0x11074},
{0x1107F, 0x11081},
{0x110B3, 0x110B6},
{0x110B9, 0x110BA},
{0x110C2},
{0x11100, 0x11102},
{0x11127, 0x1112B},
{0x1112D, 0x11134},
{0x11173},
{0x11180, 0x11181},
{0x111B6, 0x111BE},
{0x111C9, 0x111CC},
{0x111CF},
{0x1122F, 0x11231},
{0x11234},
{0x11236, 0x11237},
{0x1123E},
{0x11241},
{0x112DF},
{0x112E3, 0x112EA},
{0x11300, 0x11301},
{0x1133B, 0x1133C},
{0x11340},
{0x11366, 0x1136C},
{0x11370, 0x11374},
{0x11438, 0x1143F},
{0x11442, 0x11444},
{0x11446},
{0x1145E},
{0x114B3, 0x114B8},
{0x114BA},
{0x114BF, 0x114C0},
{0x114C2, 0x114C3},
{0x115B2, 0x115B5},
{0x115BC, 0x115BD},
{0x115BF, 0x115C0},
{0x115DC, 0x115DD},
{0x11633, 0x1163A},
{0x1163D},
{0x1163F, 0x11640},
{0x116AB},
{0x116AD},
{0x116B0, 0x116B5},
{0x116B7},
{0x1171D, 0x1171F},
{0x11722, 0x11725},
{0x11727, 0x1172B},
{0x1182F, 0x11837},
{0x11839, 0x1183A},
{0x1193B, 0x1193C},
{0x1193E},
{0x11943},
{0x119D4, 0x119D7},
{0x119DA, 0x119DB},
{0x119E0},
{0x11A01, 0x11A0A},
{0x11A33, 0x11A38},
{0x11A3B, 0x11A3E},
{0x11A47},
{0x11A51, 0x11A56},
{0x11A59, 0x11A5B},
{0x11A8A, 0x11A96},
{0x11A98, 0x11A99},
{0x11C30, 0x11C36},
{0x11C38, 0x11C3D},
{0x11C3F},
{0x11C92, 0x11CA7},
{0x11CAA, 0x11CB0},
{0x11CB2, 0x11CB3},
{0x11CB5, 0x11CB6},
{0x11D31, 0x11D36},
{0x11D3A},
{0x11D3C, 0x11D3D},
{0x11D3F, 0x11D45},
{0x11D47},
{0x11D90, 0x11D91},
{0x11D95},
{0x11D97},
{0x11EF3, 0x11EF4},
{0x11F00, 0x11F01},
{0x11F36, 0x11F3A},
{0x11F40},
{0x11F42},
{0x13440},
{0x13447, 0x13455},
{0x16AF0, 0x16AF4},
{0x16B30, 0x16B36},
{0x16F4F},
{0x16F8F, 0x16F92},
-- Exclude Khitan Small Script filler
{0x1BC9D, 0x1BC9E},
{0x1CF00, 0x1CF2D},
{0x1CF30, 0x1CF46},
{0x1D167, 0x1D169},
{0x1D17B, 0x1D182},
{0x1D185, 0x1D18B},
{0x1D1AA, 0x1D1AD},
{0x1D242, 0x1D244},
{0x1DA00, 0x1DA36},
{0x1DA3B, 0x1DA6C},
{0x1DA75},
{0x1DA84},
{0x1DA9B, 0x1DA9F},
{0x1DAA1, 0x1DAAF},
{0x1E000, 0x1E006},
{0x1E008, 0x1E018},
{0x1E01B, 0x1E021},
{0x1E023, 0x1E024},
{0x1E026, 0x1E02A},
{0x1E08F},
{0x1E130, 0x1E136},
{0x1E2AE},
{0x1E2EC, 0x1E2EF},
{0x1E4EC, 0x1E4EF},
{0x1E8D0, 0x1E8D6},
{0x1E944, 0x1E94A}
},
double = {
{0x035C, 0x0362},
{0x1DCD},
{0x1DFC}
}
}
for i, set in pairs(comb_chars) do
for j, range in ipairs(set) do
for k, char in ipairs(range) do
range[k] = u(char)
end
set[j] = table.concat(range, "-")
end
comb_chars[i] = table.concat(set)
end
comb_chars.both = comb_chars.single .. comb_chars.double
comb_chars = {
combined_single = "[^" .. comb_chars.both .. "][" .. comb_chars.single .. "]+%f[^" .. comb_chars.both .. "]",
combined_double = "[^" .. comb_chars.both .. "][" .. comb_chars.single .. "]*[" .. comb_chars.double .. "]+[" .. comb_chars.both .. "]*.[" .. comb_chars.single .. "]*",
diacritics_single = "[" .. comb_chars.single .. "]+%f[^" .. comb_chars.both .. "]",
diacritics_double = "[" .. comb_chars.single .. "]*[" .. comb_chars.double .. "]+[" .. comb_chars.both .. "]*",
end_of_diacritics = "%f[^" .. comb_chars.both .. "]"
}
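-- Minimal usage sketch (assuming these patterns are applied with the ustring
-- library, as in section 3 below; "é" stands for any base letter plus a
-- combining mark):
--   local nfd = mw.ustring.toNFD("é")                   -- "e" .. U+0301
--   mw.ustring.find(nfd, comb_chars.combined_single)    --> 1, 2 (base + mark)
--   mw.ustring.find(nfd, comb_chars.diacritics_single)  --> 2, 2 (mark only)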
-- Get the list of unsupported titles and invert it (so the keys are pagenames and values are canonical titles).
local unsupported_titles = {}
for k, v in pairs(require("Module:links/data").unsupported_titles) do
unsupported_titles[v] = k
end
data.unsupported_titles = unsupported_titles
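-- Illustrative sketch (hypothetical entry; the real mapping lives in
-- [[Module:links/data]]): if that module maps "#" to "Number sign", then after
-- the inversion
--   unsupported_titles["Number sign"]  --> "#"
-- which is the direction needed by the pagename handling below.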
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
-- Get the pagename.
local pagename = title.subpageText
:gsub("^Unsupported titles/(.*)", function(m)
data.unsupported_title = true
return unsupported_titles[m] or m
end)
-- Save pagename, as local variable will be destructively modified.
data.pagename = pagename
-- Decompose the pagename in Unicode normalization form D.
data.decompose_pagename = mw.ustring.toNFD(pagename)
-- Explode the current page name into a character table, taking decomposed combining characters into account.
local explode_pagename = {}
local pagename_len = 0
local function explode(char)
explode_pagename[char] = true
pagename_len = pagename_len + 1
return ""
end
pagename = rsub(pagename, comb_chars.combined_double, explode)
pagename = rsub(pagename, comb_chars.combined_single, explode)
:gsub("[%z\1-\127\194-\244][\128-\191]*", explode)
data.comb_chars = comb_chars
data.explode_pagename = explode_pagename
data.pagename_len = pagename_len
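-- Worked sketch (hypothetical pagename "m̂s", where "m̂" is m + combining
-- circumflex with no precomposed form): comb_chars.combined_single captures
-- "m̂" as a single unit before the byte-level gsub explodes the remaining "s",
-- giving
--   explode_pagename  --> { ["m̂"] = true, ["s"] = true }
--   pagename_len      --> 2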
-- Generate DEFAULTSORT.
data.encoded_pagename = mw.text.encode(data.pagename)
data.pagename_defaultsort = require("Module:languages").getByCode("mul"):makeSortKey(data.encoded_pagename)
frame:callParserFunction(
"DEFAULTSORT",
data.pagename_defaultsort
)
------ 4. Parse page for maintenance categories. ------
-- Manual use of {{DEFAULTSORT:}} and aliases.
local content = title:getContent() or "" -- getContent() returns nil if the page does not exist yet
local defaultsort = {
["SORT"] = true,
["SORTKEY"] = true,
["CATEGORYSORT"] = true
}
for magic_word in content:gmatch("{{%s*DEFAULT(.-):.-}}") do
if defaultsort[magic_word] then
data.pagename_defaultsort_conflict = frame:expandTemplate{
title = "tracking category",
args = {"Pages with DEFAULTSORT conflicts"}
}
break
end
end
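-- Worked sketch: "{{DEFAULTSORT:foo}}" yields the capture "SORT" and
-- "{{DEFAULTCATEGORYSORT:foo}}" yields "CATEGORYSORT"; both are in the
-- defaultsort set above, so the conflict tracking category is added.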
-- Raw wikitext use of {{DISPLAYTITLE:}}.
if content:find("{{%s*DISPLAYTITLE:.-}}") then
data.pagename_displaytitle_conflict = frame:expandTemplate{
title = "tracking category",
args = {"Pages with DISPLAYTITLE conflicts"}
}
end
-- Manual categories.
if content:find("%[%[Category:") then
track("manual categories")
end
-- Manual "terms spelled with" categories.
if content:find("terms spelled with") then
track("terms-spelled-with")
end
-- Raw wikitext use of a topic or langname category.
local wikitext_topic_cat = {}
local wikitext_langname_cat = {}
for prefix, cat in content:gmatch("%[%[[ _]*[Cc][Aa][Tt](%a-)[ _]*:[ _]*(.-)[ _]*%]%]") do
if prefix == "" or prefix:lower() == "egory" then
local code = cat:match("^([%w%-.]+):")
if code then
wikitext_topic_cat[code] = true
else
cat = cat:gsub("|.*", "")
:gsub("[ _]+", " ")
local n, name = cat:find(".%f[%z _]")
while n do
name = cat:sub(1, n)
wikitext_langname_cat[name] = true
n = cat:find(".%f[%z _]", n + 1)
end
cat = cat:reverse()
n = cat:find(".%f[%z _]")
while n do
name = cat:sub(1, n):reverse()
wikitext_langname_cat[name] = true
n = cat:find(".%f[%z _]", n + 1)
end
end
end
end
data.wikitext_topic_cat = wikitext_topic_cat
data.wikitext_langname_cat = wikitext_langname_cat
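-- Worked sketch (hypothetical raw wikitext on the page):
--   [[Category:en:Dogs]]         --> wikitext_topic_cat["en"] = true
--   [[Category:English lemmas]]  --> wikitext_langname_cat gains the keys
--                                    "English", "English lemmas" and "lemmas"
-- These tables are exported, presumably so that categories already present in
-- the raw wikitext can be detected.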
return data