Module:parser
From SUALEX
Documentation for this module may be created at Module:parser/doc
-- Module:parser
-- Parsing helpers that turn raw field strings (term, spelling, gloss,
-- reference, etymon) into plain Lua tables.
local utilities = require("Module:utilities")
-- Export table; every parse_* function in this file is attached to it.
local parser = {}
--- Parse a term field written as "item[#homonym][:display]".
-- Only the first two colon-separated parts are used.
-- @param value raw field text; nil or "" yields an empty table
-- @return table with term_item (first part without its "#..." suffix),
--   homonym (text after the first "#" of the first part, or nil) and
--   display_form (second part, or nil)
function parser.parse_term(value)
    if (value or "") == "" then return {} end
    local pieces = utilities.split_string(value, ":")
    local head = pieces[1] or ""
    -- One match yields both halves: the text before the first "#" and the
    -- non-empty remainder after it. A bare trailing "#" does not match and
    -- is therefore left in place.
    local stem, tag = head:match("^(.-)#(.+)$")
    return {
        term_item = stem or head,
        homonym = tag,
        display_form = pieces[2],
    }
end
--- Parse a spelling field written as "spelling[:orthography]".
-- @param value raw field text; nil or "" yields an empty table
-- @return table with spelling_item (first colon-separated part) and
--   orthography (second part, or nil)
function parser.parse_spelling(value)
    if (value or "") == "" then return {} end
    local pieces = utilities.split_string(value, ":")
    local result = {}
    result.spelling_item = pieces[1]
    result.orthography = pieces[2]
    return result
end
--- Parse a gloss field written as "gloss[#sup]".
-- The value is not split on ":"; only a "#..." suffix is separated out.
-- @param value raw field text; nil or "" yields an empty table
-- @return table with gloss_item (text before the first "#") and sup
--   (non-empty text after that "#", e.g. "?" from "#?", or nil)
function parser.parse_gloss(value)
    if (value or "") == "" then return {} end
    -- Lazy prefix + greedy suffix splits at the first "#" that has at least
    -- one character after it.
    local text, marker = value:match("^(.-)#(.+)$")
    if not text then
        return {gloss_item = value}
    end
    return {gloss_item = text, sup = marker}
end
--- Parse a reference field written as "author[:year[:page]]".
-- @param value raw field text; nil or "" yields an empty table
-- @return table with author, year and page taken from the first three
--   colon-separated parts (missing parts are nil)
function parser.parse_reference(value)
    if (value or "") == "" then return {} end
    local pieces = utilities.split_string(value, ":")
    local ref = {}
    ref.author = pieces[1]
    ref.year = pieces[2]
    ref.page = pieces[3]
    return ref
end
--- Split a leading language token ("lang:") off the start of `text`.
-- A token is one or more lowercase ASCII letters or hyphens directly
-- followed by a colon at position 1.
-- @return the token without its colon (nil when absent), then the rest
local function split_lang_prefix(text)
    local lang = text:match("^([a-z%-]+):")
    if not lang then return nil, text end
    -- The match is anchored at position 1, so the prefix is exactly #lang
    -- characters plus the colon; a plain sub avoids pattern escaping.
    return lang, text:sub(#lang + 2)
end

--- Remove one trailing "#token" from `text`.
-- The token must be non-empty and contain neither "#" nor ":". Excluding
-- ":" keeps a display form such as "base#1:display" from being swallowed
-- into the token; that homonym is picked up after the colon split instead.
-- @return the text before the "#", then the token (nil when no suffix)
local function pop_hash_suffix(text)
    local head, token = text:match("^(.*)#([^#:]+)$")
    if not head then return text, nil end
    return head, token
end

--- Parse an etymon field written as "type[?]:[lang:]item+item+...".
-- The text before the first colon is the descendant type; a trailing "?"
-- or "#?" on it marks the whole etymon as uncertain. The remainder may
-- start with a language token, and is then split on "+" into items. Each
-- item may carry its own leading "lang:" (which also applies to the items
-- after it), an optional ":display" part, and up to two trailing "#"
-- tokens: "#hom#sup", a lone "#?" (sup), or a lone "#hom".
-- @param value raw field text
-- @return {} when value is nil, "" or has no colon; otherwise a table with
--   descendant_type, initial_lang (may be nil), uncertainty (boolean) and
--   items, a list of tables {lang_code, etymon_item, homonym, display_form,
--   sup, uncertainty}, where uncertainty is true only when sup is "?"
function parser.parse_etymon(value)
    if not value or value == "" then return {} end
    value = utilities.trim_string(value)
    local colon_pos = value:find(":", 1, true)
    if not colon_pos then return {} end

    local descendant_type = utilities.trim_string(value:sub(1, colon_pos - 1))
    local global_uncertainty = descendant_type:find("%?$") ~= nil
    descendant_type = descendant_type:gsub("#?%?$", "")

    -- Trim before looking for the initial language so that "type: bew:..."
    -- (whitespace after the first colon) still reports initial_lang.
    -- NOTE(review): assumes utilities.trim_string strips surrounding
    -- whitespace -- confirm against Module:utilities.
    local initial_lang, sub = split_lang_prefix(
        utilities.trim_string(value:sub(colon_pos + 1)))
    sub = utilities.trim_string(sub)

    local parsed_items = {}
    local current_lang = initial_lang
    -- Appending "+" lets one lazy pattern yield every item, in order.
    for piece in (sub .. "+"):gmatch("(.-)%+") do
        local item_str = utilities.trim_string(piece)
        if item_str ~= "" then
            -- A per-item "lang:" overrides the language from here onwards.
            local lang, rest = split_lang_prefix(item_str)
            if lang then
                current_lang, item_str = lang, rest
            end
            current_lang = current_lang or "unknown"

            local homonym, sup
            local before_last, last = pop_hash_suffix(item_str)
            if last then
                local before_second, second = pop_hash_suffix(before_last)
                if second then
                    -- Two tokens: "#homonym#sup".
                    homonym, sup, item_str = second, last, before_second
                elseif last == "?" then
                    -- A lone "#?" is an uncertainty marker, not a homonym.
                    sup, item_str = last, before_last
                else
                    homonym, item_str = last, before_last
                end
            end

            item_str = utilities.trim_string(item_str)
            local parts = utilities.split_string(item_str, ":")
            local etym_item = parts[1] or ""
            local display_form = parts[2]

            -- A "#..." still attached to the base (e.g. "base#1:display")
            -- is stripped; it supplies the homonym only when none was found
            -- among the trailing tokens.
            local base, leftover = etym_item:match("^(.-)#(.+)$")
            if base then
                etym_item = base
                homonym = homonym or leftover
            end

            local parsed_item = {
                lang_code = current_lang,
                etymon_item = etym_item,
                homonym = homonym,
                display_form = display_form,
                sup = sup, -- trailing superscript content (string), e.g. "?"
            }
            if sup == "?" then
                parsed_item.uncertainty = true
            end
            parsed_items[#parsed_items + 1] = parsed_item
        end
    end

    return {
        descendant_type = descendant_type,
        initial_lang = initial_lang,
        uncertainty = global_uncertainty,
        items = parsed_items,
    }
end
--- Dispatch a raw value to the parser registered for its field name.
-- @param field_name one of "term", "spelling", "gloss", "reference",
--   "etymon"; any other name has no dedicated parser
-- @param value raw field text; nil or "" yields an empty table
-- @return the dedicated parser's result, or {raw = value} when the field
--   has no parser (or the parser returned a false value)
function parser.parse_field(field_name, value)
    if (value or "") == "" then return {} end
    local dispatch = {
        term = parser.parse_term,
        spelling = parser.parse_spelling,
        gloss = parser.parse_gloss,
        reference = parser.parse_reference,
        etymon = parser.parse_etymon,
    }
    local handler = dispatch[field_name]
    local result = handler and handler(value)
    if result then
        return result
    end
    return {raw = value}
end
--- Parse one positional entry into a table keyed by field name.
-- Positions 1..6 of `entry` map to term, spelling, gloss, reference,
-- etymon and notes. Each position may hold several values joined by the
-- separator from Module:parameters (value_separator, falling back to a
-- single backslash); every value is trimmed and passed to
-- parser.parse_field.
-- @param entry table indexed 1..6 with raw field strings (missing = "")
-- @param lang_code stored unchanged as parsed.lang_code
-- @return table with lang_code plus one list of parsed values per field
function parser.parse_entry(entry, lang_code)
    local fields = {"term", "spelling", "gloss", "reference", "etymon", "notes"}
    -- The separator does not depend on the field, so resolve it once
    -- instead of on every loop iteration.
    local sep = require("Module:parameters").value_separator or "\\"
    local parsed = {lang_code = lang_code}
    for i, field in ipairs(fields) do
        local values = utilities.split_string(entry[i] or "", sep)
        local parsed_values = {}
        for _, val in ipairs(values) do
            parsed_values[#parsed_values + 1] =
                parser.parse_field(field, utilities.trim_string(val))
        end
        parsed[field] = parsed_values
    end
    return parsed
end
-- Export the module table so callers can require() it.
return parser