-- Модуль:zh-usex (Module:zh-usex)
-- Documentation for this module may be created at Модуль:zh-usex/doc
local export = {}
local m_zh = require("Module:zh")
local m_languages = require("Module:languages")
local m_string_utils = require("Module:string utilities")
local find = m_string_utils.find
local gsub = m_string_utils.gsub
local match = m_string_utils.match
local split = m_string_utils.split
local sub = m_string_utils.sub
local upper = m_string_utils.upper
-- Use this when the actual title needs to be known.
local actual_title = mw.title.getCurrentTitle()
-- Use this when testcases need to be able to override the title (for bolding,
-- for instance).
local title = actual_title
-- Page name used for automatic bolding and self-link suppression. A
-- pre-existing global PAGENAME takes precedence if one is defined; export.show
-- reassigns this (and `title`) when a pagename argument is supplied.
local PAGENAME = PAGENAME or title.text
-- Shared data tables. They are loaded with mw.loadData(), so (as noted in
-- export.show) unpack() cannot be used on them.
local data = mw.loadData("Module:zh-usex/data")
-- Maps a punctuation token to the text emitted for it in the transcription.
local punctuation = data.punctuation
-- Keyed by the ref= argument: entry [1] supplies a default variety code and
-- entry [2] the text shown after "From:".
local ref_list = data.ref_list
-- Per-variety tables of readings, indexed [norm_code][text]; get_tr consults
-- them before falling back to m_zh.check_pron.
local pron_correction = data.pron_correction
local polysyllable_pron_correction = data.polysyllable_pron_correction
-- Closing tag matching the zh_format_start_* spans built in export.show.
local zh_format_end = "</span>"
-- Character class used by export.show to detect characters that were left
-- untransliterated in the generated transcription. The ranges are hard-coded
-- instead of being fetched from Module:scripts (see the disabled line below).
--local Han_pattern = "[" .. require("Module:scripts").getByCode("Hani"):getCharacters() .. "]"
local Han_pattern = "[一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𰀀-𲎯]"
-- Byte-level pattern for one UTF-8 encoded character: a byte outside
-- \128-\191 followed by any number of bytes inside that range.
local UTF8_char = '[^\128-\191][\128-\191]*'
local UTF8_char2 = '[^[%]\128-\191][\128-\191]*' -- not "[" or "]"
-- Per-variety settings for segmenting an example and generating its
-- transcription, keyed by normalised variety code. Fields, as used by get_tr
-- and export.show:
--   segment_c      characters that delimit links (spliced into a character
--                  class to split the example into separator + segment pairs)
--   separator_conv maps the separator run found before a segment to the text
--                  emitted in the transcription; an unlisted run is an error
--   link_ignore    characters that do not affect links (removed from the
--                  displayed text and excluded from "words" in get_tr)
--   tr_cap         whether the transcription can be capitalised (enables the
--                  leading "^" handling for target\display segments)
--   combine        post-processes one reading (manual {...} or looked up)
--                  before it is placed in the transcription
local tr_data = {
cmn = {
segment_c = " %-",
separator_conv = {[""]="",[" "]=" ",["-"]="",["--"]="-"},
link_ignore = "\1.^",
tr_cap = true,
-- Prefixes "\3" to a reading whose first byte is in the set [aoe\195-\199].
combine = function(t)
return t:gsub("^%f[aoe\195-\199]","\3") -- temporary substitute of the apostrophe
end,
},
yue = {
segment_c = " ",
separator_conv = {[""]="",[" "]=" "},
link_ignore = "\1",
tr_cap = false,
-- Drops everything from the first comma onwards.
combine = function(t) return t:gsub(",.+","") end,
},
["nan-hbl"] = {
segment_c = " ~",
separator_conv = {[""]="",[" "]=" ",["~"]="-"},
link_ignore = "\1%%.^",
tr_cap = true,
-- Drops everything from the first "/" onwards and prefixes a hyphen;
-- export.show later tidies the hyphens for this variety.
combine = function(t) return "-"..t:gsub("/.+","") end,
},
hak = {
segment_c = " ~",
separator_conv = {[""]="",[" "]=" ",["~"]="-"},
link_ignore = "\1.^",
tr_cap = true,
-- Same treatment as nan-hbl: cut at the first "/" and prefix a hyphen.
combine = function(t) return "-"..t:gsub("/.+","") end,
},
-- Fallback used by export.show when there is no entry for the variety; no
-- transcription is generated automatically in that case, hence no combine.
default = {
segment_c = " ",
separator_conv = {[""]="",[" "]=" "},
link_ignore = "\1",
tr_cap = false,
},
}
-- Produce the transcription for the display text of one segment.
--   display:   segment text; may contain "\1" (space inside a link), "[x]"
--              annotations, "{...}" manual readings and <b>...</b> tags
--   norm_code: normalised variety code; must be a key of tr_data whose entry
--              has a `combine` function
-- Returns the transcription with bold tags encoded as "\1" / "\2".
local function get_tr(display, norm_code)
	local rules = tr_data[norm_code]
	local manual_readings, manual_count = {}, 0 -- readings given as 字{reading}
	local punct_readings, punct_count = {}, 0 -- transcriptions of punctuation tokens
	local word_pattern = "[^"..rules.link_ignore.." \2{}".."]+" -- matches words

	local text = display:gsub("\1", " ")
	text = text:gsub("%["..UTF8_char2.."%]", "")

	-- Remember each manual reading and leave "{" as its placeholder.
	text = text:gsub("("..UTF8_char.."){([^{}]*)}", function(char, reading)
		manual_count = manual_count + 1
		if char:find("^%w$") then
			manual_readings[manual_count] = reading
		else
			manual_readings[manual_count] = rules.combine(reading)
		end
		return "{"
	end)

	-- Remember each punctuation token and leave "}" as its placeholder.
	text = text:gsub("%f[^ ][^ ]+%f[ ]", function(token)
		local reading = punctuation[token]
		if not reading then
			return token
		end
		punct_count = punct_count + 1
		punct_readings[punct_count] = reading
		return "}"
	end)

	-- Encode bold tags so that they survive the word lookup below.
	text = text:gsub("<b>","\1")
	text = text:gsub("</b>","\2")

	text = text:gsub(word_pattern, function(word)
		-- Whole-word lookup first: correction tables, then check_pron.
		local reading = polysyllable_pron_correction[norm_code][word]
			or pron_correction[norm_code][word]
		if reading then
			return reading
		end
		-- check_pron only cares whether the length is exactly 1.
		local length = word:find("^"..UTF8_char.."$") and 1 or 0
		reading = m_zh.check_pron(word, norm_code, length, true)
		if reading then
			return rules.combine(reading)
		end
		-- Otherwise look every character up on its own.
		return word:gsub(UTF8_char, function(ch)
			local ch_reading = pron_correction[norm_code][ch]
			if ch_reading then
				return ch_reading
			end
			ch_reading = m_zh.check_pron(ch, norm_code, 1, true)
			return ch_reading and rules.combine(ch_reading) or ch
		end)
	end)

	if norm_code == "cmn" then
		text = text:gsub("%.%.","-")
	end
	if norm_code ~= "yue" then
		text = text:gsub("%."," ")
	end

	-- Put the remembered readings back in place of their placeholders.
	local manual_index, punct_index = 0, 0
	text = text:gsub("{", function()
		manual_index = manual_index + 1
		return manual_readings[manual_index]
	end)
	text = text:gsub("}", function()
		punct_index = punct_index + 1
		return punct_readings[punct_index]
	end)

	if norm_code == "yue" then
		text = text:gsub("[a-z][1-9]%-?[1-9]?", "%0 ")
	end
	return text
end
-- Build a wikilink to the Chinese section of a page.
--   target:  page to link to; the empty string means "use the display text"
--   display: text shown for the link (bold tags are kept here)
-- Returns "[[<target without bold tags>#Chinese|<display>]]".
local function make_link(target, display)
	local page = target
	if page == "" then
		page = display
	end
	-- Bold tags must never end up inside the link target.
	page = string.gsub(page, "</?b>", "")
	-- For debugging purposes
	--if actual_title.nsText == "Module" then mw.log(display, target, "->", result) end
	return table.concat({ "[[", page, "#Chinese|", display, "]]" })
end
-- Apply conv_fun to `text` while honouring "x[y]" annotations: the character
-- before a bracket pair is replaced by the bracketed character, and only the
-- plain text in front of it is passed through conv_fun.
local function convert(conv_fun, text)
	local annotated = "([^%[%]]*)"..UTF8_char2.."%[("..UTF8_char2..")%]"
	-- The "A[A]" sentinel makes the pattern also consume the trailing plain
	-- text; the "A" it leaves behind is removed again by the final sub().
	local padded = text .. "A[A]"
	local converted = padded:gsub(annotated, function(plain, forced)
		return conv_fun(plain)..forced
	end)
	return converted:sub(1,-2)
end
--- Render a Chinese usage example, collocation or quotation as wikitext.
--
-- Reads from `frame.args` (the #invoke call):
--   category  required; e.g. "quotations" or "collocations" (used in category
--             names and in the translation request text)
--   variety   optional default variety code
-- Reads from the parent frame (the template call): 1 = example, 2 =
-- translation, 3 = variety, plus lit, tr, ref/r, inline, audio/a, collapsed,
-- nocat, tr_nocap, simp and (Template namespace only) pagename.
--
-- Returns the formatted example: a <dl class="zhusex"> block for long,
-- referenced, collapsed or explicitly "inline" examples, otherwise a one-line
-- "text ― transcription ― translation" form.
--
-- Raises an error when the category or the example is missing, when the
-- variety is not recognised, when simp= is used where it is not allowed, or
-- when the example contains an invalid separator.
function export.show(frame)
	local params = {
		[1] = { required = true }, -- example
		[2] = {}, -- translation
		[3] = {}, -- variety
		lit = {},
		tr = {},
		ref = {}, r = { alias_of = "ref" },
		inline = {},
		audio = {}, a = { alias_of = "audio" },
		collapsed = { type = "boolean" },
		-- Allow specifying pagename in testcases on documentation page.
		pagename = actual_title.nsText == "Template" and {} or nil,
		nocat = { type = "boolean" },
		tr_nocap = { type = "boolean" },
		simp = { type = "boolean" }
	}

	local category = frame.args["category"] or error("Please specify the category.")
	local args, unrecognized_args = require("Module:parameters").process(frame:getParent().args, params, true)

	if args.pagename then
		-- Override the title (the parameter is only accepted in the Template namespace).
		title = mw.title.new(args.pagename)
		PAGENAME = title.text
	end

	-- PAGENAME is spliced into Lua patterns and replacement strings below, so
	-- escape it: an unescaped "-" or "%" in a title would otherwise act as a
	-- pattern quantifier or escape character.
	local pagename_pattern = PAGENAME:gsub("%p", "%%%0")
	local pagename_repl = PAGENAME:gsub("%%", "%%%%")

	local example = args[1] or error("Example unspecified.")
	local translation = args[2]
	local literal = args["lit"]
	local reference = args["ref"]
	local manual_tr = args["tr"]
	local inline = args["inline"]
	local audio_file = args["audio"]
	local collapsed = args["collapsed"]
	local simp = args["simp"]

	-- Counts the bytes in \194-\244, i.e. the lead bytes of multi-byte characters.
	local original_length = example:gsub("[^\194-\244]+",""):len()
	local variety = args[3] or frame.args["variety"] or (ref_list[reference] and ref_list[reference][1] or false) or "cmn"
	local variety_data = data.varieties_by_code[variety] or data.varieties_by_old_code[variety] or error("Variety " .. variety .. " not recognized.")
	-- unpack() doesn't work here because the data was loaded using mw.loadData()
	local std_code, norm_code, desc, tr_desc = variety_data[2], variety_data[3], variety_data[4], variety_data[5]
	norm_code = norm_code or std_code
	variety = std_code
	local lang_obj_wikt = m_languages.getByCode(variety, 3, "allow etym")

	if next(unrecognized_args) then
		--[[Special:WhatLinksHere/Wiktionary:Tracking/zh-usex/unrecognized arg]]
		require("Module:debug").track_unrecognized_args(unrecognized_args, "zh-usex")
	end

	if reference then
		require("Module:debug").track("zh-usex/ref")
	end
	if example:find("[%(%)]") then
		require("Module:debug").track("zh-usex/parentheses")
	end
	if example:find("&#") then
		require("Module:debug").track("zh-usex/html")
	end
	-- future escape character?
	if example:find("`") then
		require("Module:debug").track("zh-usex/backtick")
	end
	-- Two consecutive spaces (a single space would match almost every example).
	if example:find("  ") then
		require("Module:debug").track("zh-usex/double-space")
	end
	if (norm_code == "nan-hbl" or norm_code:find("^hak")) and example:find("%-") then
		require("Module:debug").track("zh-usex/hyphen")
	end
	if example:find("%w%{") then
		require("Module:debug").track("zh-usex/rom-text")
	end

	if not translation or translation == '' then -- per standard [[Module:usex]]
		translation = '<small>(please add an English translation of this ' .. (category == "quotations" and "quotation" or "usage example") .. ')</small> [[Category:Requests for translations of ' .. lang_obj_wikt:getFullName() .. ' ' .. (category == "collocations" and "usage examples" or category) .. ']]'
	end

	-- should we generate the other (simp/trad) form
	-- (in the end, only actually display if the converted text is different)
	local do_conv = true
	if norm_code == "vi" or norm_code == "ko" then
		do_conv = false
	end
	local conv_fun = m_zh.ts
	if simp then
		if category ~= "quotations" then error("parameter simp cannot be true in [[Template:zh-x]] or [[Template:zh-co]].") end
		if norm_code == "vi" or norm_code == "ko" or norm_code == "lzh" or variety == "yue-HK" or variety == "cmn-TW" or
			variety == "nan-hbl-TW" or variety == "lzh-cmn-TW" or variety == "hak-hai" or variety == "hak-dab" or
			variety == "hak-zha" then
			error(("Parameter simp= cannot be specified for variety '%s'"):format(variety))
		end
		conv_fun = m_zh.st
	end

	-- should we generate the transcription
	local generate_tr = false
	if tr_data[norm_code] then
		if manual_tr then
			require("Module:debug").track("zh-usex/manual-tr")
		else
			generate_tr = true
		end
	end

	local boldify = false
	-- automatically boldify pagetitle if nothing is in bold
	if not example:find("'''") and not punctuation[PAGENAME] then
		boldify = true
	end

	-- tidying up the example, making it ready for transcription
	example = gsub(example, "[?!,。、“”…;:‘’|()「」『』—《》〈〉【】· .~]", " %0 ")
	example = example:gsub("— —", "——") -- double em-dash (to be converted to single em-dash later)
		:gsub("<br */?>"," <br> ") -- process linebreaks
		:gsub("^ *",""):gsub(" *$",""):gsub(" +"," ") -- process spaces
		:gsub("%[%[(.-)%]%]%f[^%]]",function(a) -- process [[]]
			return a:gsub(" ","\1")
		end)
		:gsub("'''([^']+)'''", "<b>%1</b>") -- normalise bold syntax
		:gsub("%^<b>","<b>^")
		:gsub("</b>(%["..UTF8_char2.."%])","%1</b>")
		:gsub("</b>({[^{}]*})","%1</b>")

	-- parsing: convert "-", "--", "---" to "-", "..", "--" respectively
	-- so that "-" is the character that delimits links
	-- further explanation will use the replacement result to refer to the commands
	if norm_code == "cmn" then
		example = example:gsub("%-+",{["--"]="..",["---"]="--"})
		if example:find("%-[^%-%s]+\\") then
			require("Module:debug").track("zh-usex/extra-pinyin")
		end
	end

	local regex_data = tr_data[norm_code] or tr_data.default
	local segment_c = regex_data.segment_c -- the characters that delimit links
	local separator_conv = regex_data.separator_conv -- the table for separator mapping
	local link_ignore = regex_data.link_ignore -- the characters that do not affect links
	local tr_cap = regex_data.tr_cap -- transliteration can be capitalised

	local segment_regex = "(["..segment_c.."]*)([^"..segment_c.."]+)" -- the regex that matches each segment and the separator before it
	local cache = {} -- store the result of each segment
	local trad_text = ""
	local simp_text = ""

	-- generate the transliteration
	-- but store the results in the cache
	-- and also build up trad_text and simp_text
	local tr_text = example:gsub(segment_regex, function(separator, seg)
		separator = separator_conv[separator] or error('Invalid separator: "'..separator..'"')

		-- Key the cache on the raw segment, "@" markers included. The "@" is
		-- stripped from `seg` below; using the stripped text as key would let
		-- an unlinked "@foo" be returned for a later, linkable "foo".
		local cache_key = seg
		local cached = cache[cache_key]
		if cached then
			trad_text = trad_text .. cached.trad
			simp_text = simp_text .. cached.simp
			return separator..cached.tr
		end

		if punctuation[seg] then
			cache[cache_key] = {
				trad = seg,
				simp = seg,
				tr = punctuation[seg]
			}
			trad_text = trad_text .. seg
			simp_text = simp_text .. seg
			return separator..punctuation[seg]
		end

		-- "@" anywhere in the segment suppresses link generation
		local at_count
		seg, at_count = seg:gsub("@","")
		local generate_link = (at_count == 0)

		local target, display = "", seg
		local pos = seg:find("\\",1,true)
		if generate_link and pos then
			-- move formatting from start of target to display
			-- e.g. <b>^甲\乙 --> 甲\<b>^乙
			local bold = ""
			local caret = ""
			local start = 1
			if seg:sub(1,3) == "<b>" then
				bold,start = "<b>",4
			end
			-- Look at the single character at `start`; comparing the whole
			-- remainder of the segment with "^" could never succeed.
			if tr_cap and seg:sub(start,start) == "^" then
				caret,start = "^",start+1
			end
			target, display = seg:sub(start,pos-1), bold..caret..seg:sub(pos+1,-1)
			if target:find("</?b>") then -- Check for bold tags in target.
				require("Module:debug").track("zh-usex/bold-target")
			end
		end
		target = target:gsub("\1","")
		local target_trad = target:gsub("%["..UTF8_char2.."%]","")
		local target_simp = do_conv and convert(conv_fun, target)

		local occurrences = 0
		if boldify then
			display, occurrences = display:gsub(pagename_pattern,"<b>"..pagename_repl.."</b>")
		end
		if occurrences > 0 then
			-- keep bold tags out of [x] annotations and move them around ^, [x] and {x}
			display = display:gsub("%[<b>"..pagename_pattern.."</b>%]","["..pagename_repl.."]")
				:gsub("%^<b>","<b>^")
				:gsub("</b>(%["..UTF8_char2.."%])","%1</b>")
				:gsub("</b>({[^{}]*})","%1</b>")
		end

		local display_derom = display:gsub("{[^{}]*}","")
			:gsub("["..link_ignore.."]+","")
		local display_trad = display_derom:gsub("%["..UTF8_char2.."%]","")
		local display_simp = do_conv and convert(conv_fun, display_derom) or ""
		local seg_tr = generate_tr and get_tr(display, norm_code) or ""

		-- never link to the current page; bold the segment instead if needed
		if (simp and display_simp or display_trad):gsub("</?b>","") == PAGENAME
			or (simp and target_simp or target_trad) == PAGENAME then
			generate_link = false
			if boldify and occurrences == 0 then
				display_trad = "<b>" .. display_trad .. "</b>"
				display_simp = "<b>" .. display_simp .. "</b>"
				seg_tr = "\1" .. seg_tr .. "\2"
			end
		end

		local seg_trad = generate_link and make_link(target_trad, display_trad) or display_trad
		local seg_simp = generate_link and do_conv and make_link(target_simp, display_simp) or display_simp
		cache[cache_key] = {
			trad = seg_trad,
			simp = seg_simp,
			tr = seg_tr
		}
		trad_text = trad_text .. seg_trad
		simp_text = simp_text .. seg_simp
		return separator..seg_tr
	end)

	if trad_text == simp_text then
		do_conv = false
		simp_text = nil
	end

	if not trad_text:find("</?b>") then
		require("Module:debug").track("zh-usex/no-bold")
	end

	-- format generated tr
	-- at this point we have three temporary substitutions:
	-- <b>:\1, </b>:\2, ':\3
	if generate_tr then
		if norm_code == "cmn" then -- format apostrophe
			tr_text = tr_text
				:gsub("%f[^%z -]([\1\2^]*)\3", "%1")
				:gsub("\1\3","\3\1") -- <b>' → '<b>
				:gsub("^\3","\3^") -- ^' → '^ (shouldn't occur)
		elseif norm_code == "nan-hbl" or norm_code == "hak" then -- format hyphens
			tr_text = tr_text
				:gsub("%^%-","-^")
				:gsub("\1%-","-\1") -- <b>- → -<b>
				:gsub("%-\2","\2-") -- -</b> → </b>-
				:gsub("%f[^%z ]%-%f[^%z %-]","") -- "-chhek" at beginning -> "chhek"
				:gsub("%f[%z %-]%-%f[%z ]","") -- "shi-" at the end -> "shi"
				:gsub("%-+","-")
				:gsub("%-?%%%-?", "--")
		end
		tr_text = tr_text:gsub("[\1\2\3]",{["\1"]="<b>",["\2"]="</b>",["\3"]="'"})
		if find(tr_text, Han_pattern) then
			require("Module:debug").track("zh-usex/character without transliteration")
		end
	end

	-- HTML entity since "[[[w:MSC|MSC]]" is interpreted poorly
	local tag_start = " <span style=\"color:darkgreen; font-size:x-small;\">&#91;"
	local tag_end = "]</span>"

	local simp_link = "<i>[[w:Simplified Chinese|simp.]]</i>"
	local trad_link = "<i>[[w:Traditional Chinese|trad.]]</i>"
	if simp then
		simp_link, trad_link = trad_link, simp_link
	end

	-- insert a space between adjacent links that would run Latin letters together
	local auto_spaces
	trad_text, auto_spaces = trad_text:gsub("([a-zA-Z]%]%])(%[%[[a-zA-Z])", "%1 %2")
	simp_text = do_conv and simp_text:gsub("([a-zA-Z]%]%])(%[%[[a-zA-Z])", "%1 %2") or false
	local phonetic = manual_tr or (generate_tr and tr_text)

	if auto_spaces > 0 then
		require("Module:debug").track("zh-usex/auto-spaces")
	end

	-- overall transcription formatting
	if phonetic then
		phonetic = gsub(phonetic, " </b>", "</b> ")
		if norm_code ~= "wuu" then
			-- collapse two consecutive spaces into one
			phonetic = gsub(phonetic, "  ", " ")
		end

		if norm_code == "yue" or norm_code == "zhx-tai" or norm_code == "csp" or norm_code == "nan-tws" or norm_code == "nan-hnm" or
			norm_code == "zhx-sic" or norm_code == "cjy" or norm_code == "hsn" or norm_code == "gan" or
			norm_code == "yue-dgx" or norm_code == "yue-yjx" or norm_code == "yue-ylx" or
			variety == "hak-mei" then
			phonetic = gsub(phonetic, "([a-zê]+)([1-9%-]+)", "%1<sup>%2</sup>") -- superscript tones
		end

		phonetic = gsub(phonetic, " ([,%.?!;:’”)])", "%1") -- remove excess spaces from punctuation
		phonetic = gsub(phonetic, "([‘“(]) ", "%1")
		phonetic = phonetic:gsub(" <br> ", "<br>")

		if not manual_tr then
			if norm_code == "nan-hbl" then
				phonetic = gsub(phonetic, " +%-%-", "--")
			end

			-- capitalisation: "^" marks the next character for upper-casing
			if norm_code == "yue" or norm_code == "zhx-tai" or norm_code == "cjy" or norm_code == "hsn" or
				norm_code == "cmn-wuh" or norm_code == "nan-tws" or norm_code == "wxa" or norm_code == "wuu" or
				variety == "hak-mei" then
				args.tr_nocap = true
			end
			if not args.tr_nocap and match(example, "[。?!]") then
				phonetic = "^" .. gsub(phonetic, "([%.?!]) ", "%1 ^")
			end
			if not args.tr_nocap then
				phonetic = gsub(phonetic, "([%.%?%!][”’]) (.)", "%1 ^%2")
				phonetic = gsub(phonetic, "<br>(.)", "<br>^%1")
				phonetic = gsub(phonetic, ": ([“‘])(.)", ": %1^%2")
			end
			phonetic = gsub(phonetic, "%^<b>", "<b>^")
			phonetic = gsub(phonetic, "%^+.", upper)
			phonetic = gsub(phonetic, "%^", "")
		end

		if norm_code == "wuu" then
			local wuu_pron = require("Module:wuu-pron")
			if phonetic:find(":") then
				phonetic = phonetic:sub(4)
			end
			phonetic = "''" .. wuu_pron.wugniu_format(phonetic) .. "''"
		elseif norm_code == "cmn-wuh" or norm_code == "wxa" then
			phonetic = "<span class=\"IPA\">[" .. phonetic .. "]</span>"
		elseif norm_code == "cdo" then
			local cdo_pron = require("Module:cdo-pron")
			phonetic = "<i>" .. phonetic .. "</i>" ..
				(not match(phonetic, "-[^ ]+-[^ ]+-[^ ]+-")
				and " / <span class=\"IPA\"><small>[" .. cdo_pron.sentence(phonetic) .. "]</small></span>"
				or "")
		else
			phonetic = "<i>" .. phonetic .. "</i>"
		end
		phonetic = "<span lang=\"zh-Latn\" style=\"color:#404D52\">" .. phonetic .. "</span>"
	end

	local collapse_start, collapse_end, collapse_tag, collapse_border_div, collapse_border_div_end = '', '', '', '', ''
	local simplified_start = '<br>'
	if collapsed then
		collapse_start = '<span class="vsHide">'
		collapse_end = '</span>'
		collapse_tag = '<span class="vsToggleElement" style="color:darkgreen;padding-left:10px"></span>'
		collapse_border_div = '<div class="vsSwitcher" data-toggle-category="usage examples" style="border-left: 1px solid #930; border-left-width: 2px; padding-left: 0.8em;">'
		collapse_border_div_end = '</div>'
		simplified_start = '<hr>'
	end

	-- Optional output fragments. They are locals so that nothing leaks into,
	-- or is inherited from, another call in the same Lua environment.
	local cat, audio, tr_tag, trad_tag, simp_tag, ts_tag

	if actual_title.nsText == '' and (not args.nocat) then -- fixme: probably categorize only if text contains the actual word
		if reference then
			cat = "[[Category:" .. lang_obj_wikt:getFullName() .. " terms with quotations]]"
		else
			cat = "[[Category:" .. lang_obj_wikt:getFullName() .. " terms with " .. category .. "]]"
		end
	end

	local zh_format_start_simp = "<span lang=\"zh-Hans\" class=\"Hans\">"
	local zh_format_start_trad = "<span lang=\"zh-Hant\" class=\"Hant\">"
	if simp then zh_format_start_simp, zh_format_start_trad = zh_format_start_trad, zh_format_start_simp end

	-- indentation, font and identity tags
	if ((norm_code == "cmn" and original_length > 7)
		or (norm_code ~= "cmn" and original_length > 5)
		or reference
		or collapsed
		or (match(example, "[,。?!、:; ]") and norm_code == "wuu")
		or (norm_code == "cdo" and original_length > 3)
		or (inline or "") ~= "") then

		-- block layout: one definition list with a row per component
		trad_text = zh_format_start_trad .. trad_text .. zh_format_end
		if not phonetic then
			translation = "<i>" .. translation .. "</i>"
		end

		if phonetic then
			phonetic = "<dd>" .. collapse_start .. phonetic
			translation = "<dd>" .. translation .. "</dd>"
			tr_tag = tag_start .. tr_desc .. tag_end .. collapse_end .. "</dd>"
		else
			translation = "<dd>" .. translation .. "</dd>"
		end

		if audio_file then
			audio = "<dd>[[File:" .. audio_file .. "]]</dd>"
		end

		if do_conv then
			trad_tag = collapse_start .. tag_start .. desc .. ", " .. trad_link .. tag_end .. collapse_end .. collapse_tag
			simp_text = simplified_start .. collapse_start .. zh_format_start_simp .. simp_text .. zh_format_end
			simp_tag = tag_start .. desc .. ", " .. simp_link .. tag_end .. collapse_end
		elseif norm_code == "vi" or norm_code == "ko" then
			trad_tag = collapse_start .. tag_start .. desc ..", " .. trad_link .. tag_end .. collapse_end .. collapse_tag
		else
			trad_tag = collapse_start .. tag_start .. desc ..", " .. trad_link .. " and " .. simp_link .. tag_end .. collapse_end .. collapse_tag
		end

		if reference then
			reference = "<dd>" .. collapse_start .. "<small><i>From:</i> " ..
				(ref_list[reference] and ref_list[reference][2] or reference) .. "</small>" .. collapse_end .. "</dd>"
		end

		return collapse_border_div .. "<dl class=\"zhusex\">" .. trad_text .. trad_tag .. (simp_text or "") .. (simp_tag or "") .. (reference or "") ..
			(phonetic and phonetic .. tr_tag or "") .. (audio or "") .. translation .. "</dl>" .. (cat or "") .. collapse_border_div_end
	else
		-- inline layout: text ― transcription ― translation
		trad_text = zh_format_start_trad .. trad_text .. zh_format_end
		local divider = " ― "

		if variety ~= "cmn" then
			ts_tag = tag_start .. desc .. tag_end
			tr_tag = tag_start .. tr_desc .. tag_end
		end

		if not phonetic then
			translation = "<i>" .. translation .. "</i>"
		end

		if do_conv then
			simp_text = "<span lang=\"zh-Hani\" class=\"Hani\">/</span>" .. zh_format_start_simp .. simp_text .. zh_format_end
		end

		if audio_file then
			audio = " [[File:" .. audio_file .. "]]"
		end

		return trad_text .. (simp_text or "") .. (ts_tag or "") .. divider ..
			(phonetic and phonetic .. (tr_tag or "") .. (audio or "") .. divider or "") .. translation .. (literal and " (literally, “" .. literal .. "”)" or "") ..
			(cat or "")
	end
end
-- function export.migrate(text, translation, ref)
-- if type(text) == "table" then
-- if not text.args or not text.args[1] then
-- text = text:getParent()
-- end
-- if text.args[2] and text.args[2] ~= '' then
-- ref = text.args[1]
-- translation = text.args[3]
-- text = text.args[2]
-- else
-- text = text.args[1]
-- end
-- end
-- text = text:gsub('^[%*#: \n]+', ''):gsub('[ \n]+$', ''):gsub(' +', ' '):gsub('\n+', '<br>'):gsub('|', '\\'):gsub('\'\'\'%[%[', ' '):gsub('%]%]\'\'\'', ' '):gsub('%]%]%[%[', ' '):gsub('%]%]', ''):gsub('%[%[', '')
-- :gsub('\'\'\'', ''):gsub(',', ','):gsub('!', '!'):gsub('%?', '?')
-- if translation then
-- if ref and ref ~= '' then
-- return '{{zh-x|' .. text .. '|' .. translation .. '|ref=' .. ref .. '}}'
-- else
-- return '{{zh-x|' .. text .. '|' .. translation .. '}}'
-- end
-- else
-- return text
-- end
-- end
return export