-- Module:parser
-- From SUALEX
-- Documentation for this module may be created at Module:parser/doc
-- Module:parser
local utilities = require("Module:utilities")
local parameters = require("Module:parameters")
-- Module table: the parse_* functions defined below are attached to it, and
-- it is returned at the end of the file.
local parser = {}
--- Parse a term value of the shape `item#homonym:display_form`.
-- The value is split on ":" with utilities.split_string (escaped-colon
-- handling, if any, is delegated to that helper -- confirm in
-- Module:utilities). Only the first two pieces are used.
-- @param value string|nil raw term text
-- @return table `{}` for nil/empty input, otherwise
--   `{term_item, homonym, display_form}` where `homonym` is the text after
--   the first "#" of the first piece (nil when absent) and `term_item` is the
--   first piece with that "#..." suffix removed.
function parser.parse_term(value)
    if (value or "") == "" then
        return {}
    end

    local pieces = utilities.split_string(value, ":")
    local head = pieces[1] or ""

    local result = {
        homonym = head:match("#(.+)$"),
        display_form = pieces[2],
    }
    -- Parentheses keep only gsub's first result (the substituted string).
    result.term_item = (head:gsub("#.+$", ""))
    return result
end
--- Parse a spelling value of the shape `spelling:orthography`.
-- @param value string|nil raw spelling text
-- @return table `{}` for nil/empty input, otherwise
--   `{spelling_item, orthography}` taken from the first two pieces returned by
--   utilities.split_string(value, ":").
function parser.parse_spelling(value)
    local result = {}
    if value and value ~= "" then
        local pieces = utilities.split_string(value, ":")
        result.spelling_item = pieces[1]
        result.orthography = pieces[2]
    end
    return result
end
--- Parse a gloss, separating an optional trailing `#marker`.
-- The marker is the run of non-"#" characters after the last "#" at the end
-- of the value (e.g. "?" from "meaning#?").
-- @param value string|nil raw gloss text
-- @return table `{}` for nil/empty input, otherwise `{gloss_item, sup}` where
--   `sup` is the marker (nil when absent) and `gloss_item` is the value with
--   the trailing `#marker` removed.
function parser.parse_gloss(value)
    local result = {}
    if value and value ~= "" then
        local marker_pattern = "#([^#]+)$"
        result.sup = value:match(marker_pattern)
        -- Parentheses drop gsub's second result (the replacement count).
        result.gloss_item = (value:gsub(marker_pattern, ""))
    end
    return result
end
--- Parse a reference of the shape `author:year:page`.
-- @param value string|nil raw reference text
-- @return table `{}` for nil/empty input, otherwise `{author, year, page}`
--   taken from the first three pieces returned by
--   utilities.split_string(value, ":"); missing pieces are nil.
function parser.parse_reference(value)
    local result = {}
    if value and value ~= "" then
        local pieces = utilities.split_string(value, ":")
        result.author = pieces[1]
        result.year = pieces[2]
        result.page = pieces[3]
    end
    return result
end
--- Parse an etymon specification of the form `TYPE:lang?:items`.
--
-- * `TYPE` is everything before the first unescaped colon. A trailing `#?` on
--   it is stripped and reported as `uncertainty = true`.
-- * An optional language code (alphanumerics and hyphens only) may follow,
--   terminated by another unescaped colon.
-- * Items are separated by `parameters.etymon_item_separator` (defaults to
--   "+" when unset). The language is sticky: it is set once at the start and
--   can be overridden by prefixing any item with `lang:`; the override then
--   applies to the following items as well.
-- * Each item may carry `:display_form` and the markers `#homonym`, `#?`
--   (stored as `sup`) or `#homonym#sup`. Markers may trail the whole item
--   (`word:shown#1`) or sit on the word itself (`word#1:shown`).
--
-- Escaped colons (`\:`) are protected via utilities.escape_colons_for_split /
-- utilities.unescape_colons_after_split.
-- NOTE(review): the exact placeholder scheme lives in Module:utilities and is
-- not visible here -- confirm before relying on escaped colons inside markers.
-- NOTE(review): an item such as `foo:bar` whose prefix looks like a language
-- code is read as language `foo` with body `bar`, not as word + display form.
--
-- @param value string|nil raw etymon text
-- @return table `{}` when `value` is nil/empty or contains no unescaped colon;
--   otherwise `{descendant_type = string, items = {...}, uncertainty = bool}`
--   where every item is `{lang_code, etymon_item, homonym, display_form, sup}`
--   and `lang_code` falls back to "unknown" when no language was given.
function parser.parse_etymon(value)
    if not value or value == "" then return {} end
    value = utilities.trim_string(value)

    -- Work on the escaped form so plain find() only ever sees real separators.
    local escaped = utilities.escape_colons_for_split(value)
    local colon_pos = escaped:find(":", 1, true)
    if not colon_pos then return {} end

    -- TYPE part; a trailing "#?" marks the whole etymon as uncertain.
    local raw_type = utilities.trim_string(
        utilities.unescape_colons_after_split(escaped:sub(1, colon_pos - 1))
    )
    local etymon_uncertainty = false
    if raw_type:match("#%?$") then
        etymon_uncertainty = true
        raw_type = raw_type:gsub("#%?$", "")
    end
    local descendant_type = raw_type

    local sub_escaped = escaped:sub(colon_pos + 1)

    -- Optional initial language token, e.g. "bew:...".
    local initial_lang = nil
    local init_col = sub_escaped:find(":", 1, true)
    if init_col then
        local maybe_lang = sub_escaped:sub(1, init_col - 1)
        if maybe_lang:match("^[%w%-]+$") then
            initial_lang = utilities.unescape_colons_after_split(maybe_lang)
            sub_escaped = sub_escaped:sub(init_col + 1)
        end
    end

    -- Split into items. The separator is appended to the subject literally, so
    -- it must also be matched literally: escape any punctuation before using
    -- it inside a Lua pattern (a no-op in effect for the usual "+").
    local separator = parameters.etymon_item_separator or "+"
    local separator_pattern = separator:gsub("%p", "%%%0")
    local item_strings_escaped = {}
    for item in (sub_escaped .. separator):gmatch("(.-)" .. separator_pattern) do
        local trimmed = utilities.trim_string(item)
        if trimmed ~= "" then table.insert(item_strings_escaped, trimmed) end
    end

    -- A trailing "#marker" must not reach back across the ":" that introduces
    -- the display form; otherwise "word#1:shown" would yield the homonym
    -- "1:shown" and lose the display form.
    local trailing_marker = "#([^#:]+)$"

    local parsed_items = {}
    local current_lang = initial_lang -- sticky across items
    for _, esc_item in ipairs(item_strings_escaped) do
        local item_escaped = utilities.trim_string(esc_item)

        -- Per-item language override: text before the first unescaped ':'.
        local override_lang = nil
        local colon_local = item_escaped:find(":", 1, true)
        local body_escaped = item_escaped
        if colon_local then
            local maybe_lang = item_escaped:sub(1, colon_local - 1)
            if maybe_lang:match("^[%w%-]+$") then
                override_lang = utilities.unescape_colons_after_split(maybe_lang)
                body_escaped = item_escaped:sub(colon_local + 1)
            end
        end
        if override_lang then current_lang = override_lang end

        -- Unescape the body so the '#' markers and display part can be read.
        local item_str = utilities.unescape_colons_after_split(body_escaped)
        item_str = utilities.trim_string(item_str)

        local sup = nil
        local homonym = nil
        local last = item_str:match(trailing_marker)
        if last then
            -- Drop "#" .. last from the end (#last characters plus the '#').
            local before_last = item_str:sub(1, -(#last) - 2)
            local second = before_last:match(trailing_marker)
            if second then
                -- Two trailing markers: "#homonym#sup".
                homonym = second
                sup = last
                item_str = before_last:sub(1, -(#second) - 2)
            else
                -- One trailing marker: "?" is a sup, anything else a homonym.
                if last == "?" then
                    sup = last
                else
                    homonym = last
                end
                item_str = before_last
            end
        end
        item_str = utilities.trim_string(item_str)

        -- What remains is "etymon_item" or "etymon_item:display_form".
        local parts = utilities.split_string(item_str, ":")
        local etym_item = parts[1] or ""
        local display_form = parts[2]

        -- Homonym attached directly to the word ("word#N:display").
        local leftover_homonym = etym_item:match("#(.+)$")
        if leftover_homonym and not homonym then
            homonym = leftover_homonym
        end
        etym_item = etym_item:gsub("#.+$", "")

        table.insert(parsed_items, {
            lang_code = current_lang or "unknown",
            etymon_item = etym_item,
            homonym = homonym,
            display_form = display_form,
            sup = sup, -- keep item-level sup for rendering only
        })
    end

    return {
        descendant_type = descendant_type,
        items = parsed_items,
        uncertainty = etymon_uncertainty
    }
end
--- Parse a full entry row into per-field lists of parsed values.
-- Positional columns of `entry` map, in order, to: term, spelling, gloss_en,
-- gloss_es, reference, etymon, notes. Each column is split on
-- `parameters.value_separator` (a single backslash when unset), every piece is
-- trimmed, empty pieces are skipped, and the rest are handed to the matching
-- parse_* function. Fields without a dedicated parser (notes) are stored as
-- `{raw = text}`.
-- Allows the old format (single gloss) as well as the new one (gloss_en,
-- gloss_es).
-- @param entry table positional row; missing columns are treated as ""
-- @param lang_code any stored unchanged as `lang_code` on the result
-- @return table `{lang_code = ..., term = {...}, spelling = {...}, ...}`
function parser.parse_entry(entry, lang_code)
    local field_names = {"term", "spelling", "gloss_en", "gloss_es", "reference", "etymon", "notes"}
    local field_parsers = {
        term = parser.parse_term,
        spelling = parser.parse_spelling,
        gloss_en = parser.parse_gloss,
        gloss_es = parser.parse_gloss,
        reference = parser.parse_reference,
        etymon = parser.parse_etymon,
    }
    local separator = parameters.value_separator or "\\"

    local parsed = {lang_code = lang_code}
    for index, name in ipairs(field_names) do
        local bucket = {}
        parsed[name] = bucket

        local handler = field_parsers[name]
        local pieces = utilities.split_string(entry[index] or "", separator)
        for _, piece in ipairs(pieces) do
            local text = utilities.trim_string(piece)
            if text ~= "" then
                if handler then
                    bucket[#bucket + 1] = handler(text)
                else
                    bucket[#bucket + 1] = { raw = text }
                end
            end
        end
    end

    -- Backward compatibility: if gloss_en came out empty although the third
    -- column holds text, parse that column as one gloss.
    -- NOTE(review): gloss_en is itself read from entry[3], so this only fires
    -- when entry[3] is non-empty but every separated piece trims to "" --
    -- confirm whether the fallback is still needed.
    local old_gloss = entry[3]
    if #parsed.gloss_en == 0 and old_gloss and old_gloss ~= "" then
        table.insert(parsed.gloss_en, parser.parse_gloss(old_gloss))
    end

    return parsed
end
-- Export the module table so callers that require() this module receive the
-- parse_* functions.
return parser