parser: Difference between revisions
From SUALEX
Created page with "-- Module:parser local utilities = require("Module:utilities") local parser = {} function parser.parse_term(value) if not value or value == "" then return {} end local parts = utilities.split_string(value, ":") local term_item = parts[1] or "" local display_form = parts[2] -- capture whatever follows # as homonym (string), e.g. "1" for "#1" local homonym = term_item:match("#(.+)$") term_item = term_item:gsub("#.+$", "") return {term_ite..." |
No edit summary |
||
| Line 2: | Line 2: | ||
local utilities = require("Module:utilities") | local utilities = require("Module:utilities") | ||
local parameters = require("Module:parameters") | |||
local parser = {} | local parser = {} | ||
| Line 7: | Line 8: | ||
function parser.parse_term(value) | function parser.parse_term(value) | ||
if not value or value == "" then return {} end | if not value or value == "" then return {} end | ||
-- allow escaped colons inside term values | |||
local parts = utilities.split_string(value, ":") | local parts = utilities.split_string(value, ":") | ||
local term_item = parts[1] or "" | local term_item = parts[1] or "" | ||
local display_form = parts[2] | local display_form = parts[2] | ||
local homonym = term_item:match("#(.+)$") | local homonym = term_item:match("#(.+)$") | ||
term_item = term_item:gsub("#.+$", "") | term_item = term_item:gsub("#.+$", "") | ||
| Line 25: | Line 26: | ||
if not value or value == "" then return {} end | if not value or value == "" then return {} end | ||
-- capture trailing #... as sup (e.g. "?" from "#?") | -- capture trailing #... as sup (e.g. "?" from "#?") | ||
local sup = value:match("#( | local sup = value:match("#([^#]+)$") | ||
local raw = value:gsub("#([^#]+)$", "") | |||
return {gloss_item = | return {gloss_item = raw, sup = sup} | ||
end | end | ||
| Line 36: | Line 37: | ||
end | end | ||
-- etymon format: TYPE:lang?:items | |||
-- items separated by + ; language is sticky (set once at start or overridden per item) | |||
-- supports escaped colons (\:) inside item forms | |||
function parser.parse_etymon(value) | function parser.parse_etymon(value) | ||
if not value or value == "" then return {} end | if not value or value == "" then return {} end | ||
value = utilities.trim_string(value) | value = utilities.trim_string(value) | ||
local colon_pos = | |||
-- escape literal \: so subsequent plain find() works on unescaped colons | |||
local escaped = utilities.escape_colons_for_split(value) | |||
local colon_pos = escaped:find(":", 1, true) | |||
if not colon_pos then return {} end | if not colon_pos then return {} end | ||
-- | -- raw type (unescaped); strip global "#?" only from the type and set parsed.uncertainty | ||
local initial_lang = | local raw_type = utilities.trim_string( | ||
if | utilities.unescape_colons_after_split(escaped:sub(1, colon_pos - 1)) | ||
) | |||
local etymon_uncertainty = false | |||
if raw_type:match("#%?$") then | |||
etymon_uncertainty = true | |||
raw_type = raw_type:gsub("#%?$", "") | |||
end | |||
local descendant_type = raw_type | |||
local sub_escaped = escaped:sub(colon_pos + 1) | |||
-- detect an initial language token at the start (e.g. "bew:..."); must be an unescaped colon | |||
local initial_lang = nil | |||
local init_col = sub_escaped:find(":", 1, true) | |||
if init_col then | |||
local maybe_lang = sub_escaped:sub(1, init_col - 1) | |||
-- allow alphanumerics and hyphen in language codes | |||
if maybe_lang:match("^[%w%-]+$") then | |||
initial_lang = utilities.unescape_colons_after_split(maybe_lang) | |||
sub_escaped = sub_escaped:sub(init_col + 1) | |||
end | |||
end | end | ||
-- split on '+' (items) | -- split on '+' (items) in the escaped string so literal \: is preserved | ||
local | local item_strings_escaped = {} | ||
for item in ( | for item in (sub_escaped .. parameters.etymon_item_separator):gmatch("(.-)" .. parameters.etymon_item_separator) do | ||
local trimmed = utilities.trim_string(item) | local trimmed = utilities.trim_string(item) | ||
if trimmed ~= "" then table.insert( | if trimmed ~= "" then table.insert(item_strings_escaped, trimmed) end | ||
end | end | ||
local parsed_items = {} | local parsed_items = {} | ||
local current_lang = initial_lang | local current_lang = initial_lang -- sticky across + items | ||
for _, | for _, esc_item in ipairs(item_strings_escaped) do | ||
local | local item_escaped = utilities.trim_string(esc_item) | ||
-- | -- see if this item explicitly overrides language: look for the first unescaped ':' | ||
local | local override_lang = nil | ||
if | local colon_local = item_escaped:find(":", 1, true) | ||
local body_escaped = item_escaped | |||
if colon_local then | |||
local maybe_lang = item_escaped:sub(1, colon_local - 1) | |||
if maybe_lang:match("^[%w%-]+$") then | |||
override_lang = utilities.unescape_colons_after_split(maybe_lang) | |||
body_escaped = item_escaped:sub(colon_local + 1) | |||
end | |||
end | end | ||
if | |||
if override_lang then current_lang = override_lang end | |||
-- now unescape the item body so we can parse #... and display parts | |||
local item_str = utilities.unescape_colons_after_split(body_escaped) | |||
item_str = utilities.trim_string(item_str) | |||
local sup = nil | local sup = nil | ||
local homonym = nil | local homonym = nil | ||
-- detect trailing #... (could be two '#'): last = sup or maybe homonym | |||
local last = item_str:match("#([^#]+)$") | local last = item_str:match("#([^#]+)$") | ||
if last then | if last then | ||
local before_last = item_str:sub(1, -(#last) - 2) -- remove '#...' suffix | |||
local before_last = item_str:sub(1, -(#last) - 2) -- remove '#' | |||
local second = before_last:match("#([^#]+)$") | local second = before_last:match("#([^#]+)$") | ||
if second then | if second then | ||
| Line 88: | Line 118: | ||
homonym = second | homonym = second | ||
sup = last | sup = last | ||
item_str = before_last:sub(1, -(#second) - 2) | item_str = before_last:sub(1, -(#second) - 2) | ||
else | else | ||
-- | -- single '#': either sup if '?' else homonym | ||
if last == "?" then | if last == "?" then | ||
sup = last | sup = last | ||
item_str = before_last | item_str = before_last | ||
else | else | ||
homonym = last | homonym = last | ||
item_str = before_last | item_str = before_last | ||
end | end | ||
end | end | ||
| Line 104: | Line 133: | ||
item_str = utilities.trim_string(item_str) | item_str = utilities.trim_string(item_str) | ||
-- the remaining item_str may contain a colon separating etymon_item and display_form; use split_string to honor any escaped colons | |||
local parts = utilities.split_string(item_str, ":") | local parts = utilities.split_string(item_str, ":") | ||
local etym_item = parts[1] or "" | local etym_item = parts[1] or "" | ||
local display_form = parts[2] | local display_form = parts[2] | ||
-- leftover homonym inside etym_item (#N) | |||
local leftover_homonym = etym_item:match("#(.+)$") | local leftover_homonym = etym_item:match("#(.+)$") | ||
if leftover_homonym and not homonym then | if leftover_homonym and not homonym then | ||
| Line 113: | Line 144: | ||
end | end | ||
etym_item = etym_item:gsub("#.+$", "") | etym_item = etym_item:gsub("#.+$", "") | ||
local lang_code_final = current_lang or "unknown" | |||
local parsed_item = { | local parsed_item = { | ||
lang_code = | lang_code = lang_code_final, | ||
etymon_item = etym_item, | etymon_item = etym_item, | ||
homonym = homonym, | homonym = homonym, | ||
display_form = display_form, | display_form = display_form, | ||
sup = sup, -- | sup = sup, -- keep item-level sup for rendering only | ||
} | } | ||
table.insert(parsed_items, parsed_item) | table.insert(parsed_items, parsed_item) | ||
| Line 131: | Line 160: | ||
return { | return { | ||
descendant_type = descendant_type, | descendant_type = descendant_type, | ||
items = parsed_items, | |||
uncertainty = etymon_uncertainty | |||
items = parsed_items | |||
} | } | ||
end | end | ||
-- parse a full entry row; allow old format (single gloss) and new format (gloss_en, gloss_es) | |||
function parser.parse_entry(entry, lang_code) | function parser.parse_entry(entry, lang_code) | ||
local fields = {"term", "spelling", " | local fields = {"term", "spelling", "gloss_en", "gloss_es", "reference", "etymon", "notes"} | ||
local parsed = {lang_code = lang_code} | local parsed = {lang_code = lang_code} | ||
for i, field in ipairs(fields) do | for i, field in ipairs(fields) do | ||
local raw = entry[i] or "" | local raw = entry[i] or "" | ||
local sep = | local sep = parameters.value_separator or "\\" | ||
local values = utilities.split_string(raw, sep) | local values = utilities.split_string(raw, sep) | ||
parsed[field] = {} | parsed[field] = {} | ||
for _, val in ipairs(values) do | for _, val in ipairs(values) do | ||
table.insert(parsed[field], parser. | local trimmed = utilities.trim_string(val) | ||
if trimmed ~= "" then | |||
if field:match("^gloss") then | |||
table.insert(parsed[field], parser.parse_gloss(trimmed)) | |||
elseif field == "term" then | |||
table.insert(parsed[field], parser.parse_term(trimmed)) | |||
elseif field == "spelling" then | |||
table.insert(parsed[field], parser.parse_spelling(trimmed)) | |||
elseif field == "reference" then | |||
table.insert(parsed[field], parser.parse_reference(trimmed)) | |||
elseif field == "etymon" then | |||
table.insert(parsed[field], parser.parse_etymon(trimmed)) | |||
else | |||
table.insert(parsed[field], { raw = trimmed }) | |||
end | |||
end | |||
end | end | ||
end | end | ||
-- backward compatibility: if gloss_en empty but entry[3] existed in old format, populate gloss_en from that | |||
if #parsed.gloss_en == 0 and entry[3] and entry[3] ~= "" then | |||
table.insert(parsed.gloss_en, parser.parse_gloss(entry[3])) | |||
end | |||
return parsed | return parsed | ||
end | end | ||
return parser | return parser | ||
Latest revision as of 07:26, 29 January 2026
Documentation for this module may be created at Module:parser/doc
-- Module:parser
local utilities = require("Module:utilities")
local parameters = require("Module:parameters")
local parser = {}
-- Parse one term value of the form "item[#homonym][:display_form]".
-- value: raw term string; nil, false or "" yields an empty table.
-- Returns a table with:
--   term_item    - first colon-separated piece with any "#..." tail removed
--   homonym      - text captured after '#' in that first piece (nil if absent)
--   display_form - second colon-separated piece (nil if absent)
-- NOTE(review): escaped colons are presumably handled inside
-- utilities.split_string; that helper is not visible here - confirm.
function parser.parse_term(value)
  local is_blank = (not value) or value == ""
  if is_blank then
    return {}
  end

  local pieces = utilities.split_string(value, ":")
  local head = pieces[1] or ""

  local result = {
    -- greedy capture: everything from the first matching '#' to the end of the piece
    homonym = head:match("#(.+)$"),
    display_form = pieces[2],
  }
  -- parentheses drop gsub's second return value (the substitution count)
  result.term_item = (head:gsub("#.+$", ""))
  return result
end
-- Parse one spelling value of the form "spelling[:orthography]".
-- value: raw spelling string; nil, false or "" yields an empty table.
-- Returns { spelling_item = <first piece>, orthography = <second piece or nil> }.
function parser.parse_spelling(value)
  if value and value ~= "" then
    local pieces = utilities.split_string(value, ":")
    local spelling_item, orthography = pieces[1], pieces[2]
    return { spelling_item = spelling_item, orthography = orthography }
  end
  return {}
end
-- Parse one gloss value, peeling off an optional trailing "#marker".
-- value: raw gloss string; nil, false or "" yields an empty table.
-- Returns { gloss_item = <text before the marker>, sup = <marker or nil> }.
-- The marker is the non-empty run of non-'#' characters after the last '#'
-- (e.g. "?" from "#?"); a value ending in a bare '#' has no marker.
function parser.parse_gloss(value)
  if value and value ~= "" then
    local marker = value:match("#([^#]+)$")
    local text = value
    if marker then
      -- drop the marker plus the '#' that introduces it
      text = value:sub(1, #value - #marker - 1)
    end
    return { gloss_item = text, sup = marker }
  end
  return {}
end
-- Parse one reference value of the form "author[:year[:page]]".
-- value: raw reference string; nil, false or "" yields an empty table.
-- Returns { author, year, page } taken from the first three colon-separated
-- pieces; missing pieces are nil.
function parser.parse_reference(value)
  if value and value ~= "" then
    local pieces = utilities.split_string(value, ":")
    local ref = {}
    ref.author = pieces[1]
    ref.year = pieces[2]
    ref.page = pieces[3]
    return ref
  end
  return {}
end
-- Split `text` on every literal occurrence of `sep` and return the trimmed,
-- non-empty pieces in order. Uses plain find, so characters that are magic in
-- Lua patterns ("+", ".", "-", "%", ...) are matched literally.
local function split_etymon_items(text, sep)
  local pieces = {}
  local start = 1
  while true do
    local from, to = text:find(sep, start, true)
    local stop = (from or (#text + 1)) - 1
    local piece = utilities.trim_string(text:sub(start, stop))
    if piece ~= "" then
      pieces[#pieces + 1] = piece
    end
    if not from then
      break
    end
    start = to + 1
  end
  return pieces
end

-- If `escaped_text` begins with "<code>:" where <code> consists only of
-- characters matched by [%w%-], return the (unescaped) code and the text after
-- the colon. Otherwise return nil and the text unchanged.
local function take_lang_prefix(escaped_text)
  local colon_at = escaped_text:find(":", 1, true)
  if colon_at then
    local candidate = escaped_text:sub(1, colon_at - 1)
    if candidate:match("^[%w%-]+$") then
      return utilities.unescape_colons_after_split(candidate),
        escaped_text:sub(colon_at + 1)
    end
  end
  return nil, escaped_text
end

-- Strip up to two trailing "#..." suffixes from `item_str`.
-- Two suffixes: the inner one is the homonym, the outer one the sup.
-- One suffix: it is the sup when it is exactly "?", otherwise the homonym.
-- Returns the stripped string, homonym (or nil), sup (or nil).
local function take_hash_suffixes(item_str)
  local homonym, sup
  local last = item_str:match("#([^#]+)$")
  if last then
    -- remove the suffix text plus the '#' that introduces it
    local before_last = item_str:sub(1, -(#last) - 2)
    local second = before_last:match("#([^#]+)$")
    if second then
      homonym, sup = second, last
      item_str = before_last:sub(1, -(#second) - 2)
    else
      if last == "?" then
        sup = last
      else
        homonym = last
      end
      item_str = before_last
    end
  end
  return item_str, homonym, sup
end

-- Parse an etymon value of the form  TYPE[#?]:[lang:]item[+item...]
--   * TYPE is everything before the first unescaped colon; a trailing "#?"
--     on it is removed and reported as `uncertainty = true`.
--   * Items are separated by parameters.etymon_item_separator ("+" if that
--     parameter is nil or empty), matched as literal text.
--   * The language is sticky: set once after TYPE and/or overridden by a
--     "lang:" prefix on any item; items with no language get "unknown".
--   * Each item may carry "#homonym" and/or "#sup" suffixes and an optional
--     ":display_form".
-- value: raw etymon string; nil, false, "" or a string without an unescaped
--        colon yields an empty table.
-- Returns { descendant_type = string, items = { {lang_code, etymon_item,
--           homonym, display_form, sup}, ... }, uncertainty = boolean }.
-- NOTE(review): any item whose text before its first colon is purely
-- alphanumeric/hyphen is read as a language override, so "word:display"
-- is indistinguishable from "lang:word" here - confirm this is intended.
function parser.parse_etymon(value)
  if not value or value == "" then return {} end
  value = utilities.trim_string(value)

  -- work on the escaped form so plain find() only sees unescaped colons
  local escaped = utilities.escape_colons_for_split(value)
  local colon_pos = escaped:find(":", 1, true)
  if not colon_pos then return {} end

  local descendant_type = utilities.trim_string(
    utilities.unescape_colons_after_split(escaped:sub(1, colon_pos - 1))
  )
  local etymon_uncertainty = false
  if descendant_type:match("#%?$") then
    etymon_uncertainty = true
    -- drop the two-character "#?" marker, then re-trim so "der #?" gives "der"
    descendant_type = utilities.trim_string(descendant_type:sub(1, -3))
  end

  local initial_lang, items_escaped = take_lang_prefix(escaped:sub(colon_pos + 1))

  local separator = parameters.etymon_item_separator
  if separator == nil or separator == "" then
    separator = "+"
  end

  local parsed_items = {}
  local current_lang = initial_lang -- sticky across items
  for _, item_escaped in ipairs(split_etymon_items(items_escaped, separator)) do
    local override_lang, body_escaped = take_lang_prefix(item_escaped)
    if override_lang then current_lang = override_lang end

    -- unescape the body before looking at '#' suffixes and the display part
    local item_str = utilities.trim_string(
      utilities.unescape_colons_after_split(body_escaped)
    )
    local homonym, sup
    item_str, homonym, sup = take_hash_suffixes(item_str)
    item_str = utilities.trim_string(item_str)

    -- NOTE(review): the body is already unescaped at this point; whether
    -- split_string still recognises escaped colons depends on what
    -- unescape_colons_after_split emits (helpers not visible) - confirm.
    local parts = utilities.split_string(item_str, ":")
    local etym_item = parts[1] or ""
    local display_form = parts[2]

    -- a "#N" left inside the item part (i.e. before ":display") is a homonym
    -- only if no trailing homonym was found; it is removed from the item either way
    local leftover_homonym = etym_item:match("#(.+)$")
    if leftover_homonym and not homonym then
      homonym = leftover_homonym
    end
    etym_item = etym_item:gsub("#.+$", "")

    parsed_items[#parsed_items + 1] = {
      lang_code = current_lang or "unknown",
      etymon_item = etym_item,
      homonym = homonym,
      display_form = display_form,
      sup = sup, -- keep item-level sup for rendering only
    }
  end

  return {
    descendant_type = descendant_type,
    items = parsed_items,
    uncertainty = etymon_uncertainty
  }
end
-- Parse a full entry row into per-field lists of parsed values.
-- entry:     array-like table; positions 1..7 are read as term, spelling,
--            gloss_en, gloss_es, reference, etymon, notes. Missing cells are
--            treated as "".
-- lang_code: stored unchanged on the result as `lang_code`.
-- Each cell is split on parameters.value_separator ("\\" when unset); every
-- non-empty trimmed piece is parsed by the matching parser.parse_* function.
-- Fields without a dedicated parser (notes) are stored as { raw = <piece> }.
-- Returns a table with `lang_code` plus one list per field name.
function parser.parse_entry(entry, lang_code)
  local fields = {"term", "spelling", "gloss_en", "gloss_es", "reference", "etymon", "notes"}
  -- field name -> parser; built per call so the functions are looked up at call time
  local handlers = {
    term = parser.parse_term,
    spelling = parser.parse_spelling,
    gloss_en = parser.parse_gloss,
    gloss_es = parser.parse_gloss,
    reference = parser.parse_reference,
    etymon = parser.parse_etymon,
  }
  -- the separator does not depend on the field, so resolve it once
  local sep = parameters.value_separator or "\\"

  local parsed = {lang_code = lang_code}
  for i, field in ipairs(fields) do
    local bucket = {}
    parsed[field] = bucket
    local handler = handlers[field]
    for _, val in ipairs(utilities.split_string(entry[i] or "", sep)) do
      local trimmed = utilities.trim_string(val)
      if trimmed ~= "" then
        if handler then
          bucket[#bucket + 1] = handler(trimmed)
        else
          bucket[#bucket + 1] = { raw = trimmed }
        end
      end
    end
  end

  -- backward compatibility: if gloss_en came out empty but the third cell has
  -- content, parse that cell as a single gloss. The cell is trimmed first so a
  -- whitespace-only cell does not produce a gloss made of blanks.
  if #parsed.gloss_en == 0 and entry[3] then
    local legacy_gloss = utilities.trim_string(entry[3])
    if legacy_gloss ~= "" then
      table.insert(parsed.gloss_en, parser.parse_gloss(legacy_gloss))
    end
  end
  return parsed
end
return parser