Actions

Module

Module:parser

From SUALEX

Documentation for this module may be created at Module:parser/doc

-- Module:parser

local utilities = require("Module:utilities")
local parameters = require("Module:parameters")

-- Module table: the parse_* functions defined below are attached to it, and it
-- is returned as the module's export at the end of the file.
local parser = {}

-- Parse a term value shaped like "item[#homonym][:display_form]".
--
-- value: raw term string; nil, false and "" all yield an empty table.
-- Returns a table with:
--   term_item    - first ':'-separated piece with any "#..." suffix removed
--   homonym      - text after the first '#' of that piece, or nil
--   display_form - second ':'-separated piece, or nil
-- The ':' split is delegated to utilities.split_string (presumably aware of
-- escaped colons, as the original comment stated -- verify in Module:utilities).
function parser.parse_term(value)
    if (value or "") == "" then
        return {}
    end

    local pieces = utilities.split_string(value, ":")
    local head = pieces[1] or ""

    -- One anchored match splits the head at its first '#' that still has at
    -- least one character after it; no match means there is no homonym.
    local stem, homonym = head:match("^(.-)#(.+)$")

    return {
        term_item = stem or head,
        homonym = homonym,
        display_form = pieces[2],
    }
end

-- Parse a spelling value shaped like "spelling[:orthography]".
--
-- value: raw spelling string; nil, false and "" all yield an empty table.
-- Returns {spelling_item = <first piece>, orthography = <second piece or nil>}.
function parser.parse_spelling(value)
    if value == nil or value == false or value == "" then
        return {}
    end

    local pieces = utilities.split_string(value, ":")
    local result = {}
    result.spelling_item = pieces[1]
    result.orthography = pieces[2]
    return result
end

-- Parse a gloss value, separating an optional trailing "#..." marker.
--
-- value: raw gloss string; nil, false and "" all yield an empty table.
-- Returns {gloss_item = <text without the trailing marker>, sup = <marker or nil>}.
-- Only the last '#' counts, and only when at least one non-'#' character
-- follows it (e.g. "maybe#?" gives gloss_item "maybe", sup "?").
function parser.parse_gloss(value)
    if value == nil or value == false or value == "" then
        return {}
    end

    -- Lazy prefix + anchored suffix: the suffix cannot contain '#', so the
    -- split always happens at the final '#'.
    local body, marker = value:match("^(.-)#([^#]+)$")
    if marker == nil then
        body = value
    end

    return {gloss_item = body, sup = marker}
end

-- Parse a reference value shaped like "author[:year[:page]]".
--
-- value: raw reference string; nil, false and "" all yield an empty table.
-- Returns {author, year, page} taken from the first three ':'-separated
-- pieces; missing pieces are nil.
function parser.parse_reference(value)
    if (value or "") == "" then
        return {}
    end

    local pieces = utilities.split_string(value, ":")
    local author, year, page = pieces[1], pieces[2], pieces[3]
    return {author = author, year = year, page = page}
end

-- Escape every punctuation character in `text` so it can be embedded in a Lua
-- pattern and match itself literally.
local function escape_pattern(text)
    -- the outer parentheses drop gsub's second return value (the match count)
    return (text:gsub("%p", "%%%0"))
end

-- Peel a leading "lang:" prefix off a colon-escaped string.
-- The prefix is accepted only when everything before the first ':' consists
-- of %w characters and hyphens. Returns (lang, rest) on success, otherwise
-- (nil, escaped) with the input unchanged.
local function split_lang_prefix(escaped)
    local colon_at = escaped:find(":", 1, true)
    if colon_at then
        local candidate = escaped:sub(1, colon_at - 1)
        if candidate:match("^[%w%-]+$") then
            return utilities.unescape_colons_after_split(candidate),
                escaped:sub(colon_at + 1)
        end
    end
    return nil, escaped
end

-- Strip up to two trailing "#token" suffixes from an item string.
-- Returns: remaining text, homonym (or nil), sup (or nil).
--   two suffixes  "x#H#S" -> homonym = H, sup = S
--   one suffix    "x#?"   -> sup = "?"
--   one suffix    "x#H"   -> homonym = H
-- A suffix token may not contain ':'; otherwise "item#1:Display" would be read
-- as homonym "1:Display" and the display form would be lost. Such a '#' is
-- left in place for the caller to pick up after splitting on ':'.
local function strip_hash_suffixes(item_str)
    local last = item_str:match("#([^#:]+)$")
    if not last then
        return item_str, nil, nil
    end

    local before_last = item_str:sub(1, -(#last) - 2) -- drop "#" .. last
    local second = before_last:match("#([^#:]+)$")
    if second then
        return before_last:sub(1, -(#second) - 2), second, last
    end
    if last == "?" then
        return before_last, nil, last
    end
    return before_last, last, nil
end

-- etymon format: TYPE:lang?:items
-- items separated by + ; language is sticky (set once at start or overridden per item)
-- supports escaped colons (\:) inside item forms
--
-- value: raw etymon string; nil, false, "" or a string without any unescaped
--        ':' yields an empty table.
-- Returns a table with:
--   descendant_type - text before the first unescaped ':', with a trailing
--                     "#?" removed
--   uncertainty     - true when that trailing "#?" was present on the type
--   items           - array of {lang_code, etymon_item, homonym, display_form, sup}
-- Items without any language in effect get lang_code "unknown".
--
-- NOTE(review): the per-item language override accepts any %w/hyphen prefix
-- before ':', so an item written as "word:Display" with a purely alphanumeric
-- word is read as language "word" -- confirm this is acceptable for the data.
function parser.parse_etymon(value)
    if not value or value == "" then return {} end
    value = utilities.trim_string(value)

    -- escape literal \: so the plain find() calls below only see real separators
    -- (exact placeholder scheme lives in Module:utilities)
    local escaped = utilities.escape_colons_for_split(value)
    local colon_pos = escaped:find(":", 1, true)
    if not colon_pos then return {} end

    -- descendant type; a trailing "#?" marks the whole etymon as uncertain
    local descendant_type = utilities.trim_string(
        utilities.unescape_colons_after_split(escaped:sub(1, colon_pos - 1))
    )
    local etymon_uncertainty = false
    if descendant_type:match("#%?$") then
        etymon_uncertainty = true
        descendant_type = (descendant_type:gsub("#%?$", ""))
    end

    -- optional initial language token (e.g. "bew:...")
    local initial_lang, sub_escaped = split_lang_prefix(escaped:sub(colon_pos + 1))

    -- The separator is configuration data, not a pattern: escape it before
    -- embedding it so magic characters in it are matched literally.
    local separator = parameters.etymon_item_separator or "+"
    local item_pattern = "(.-)" .. escape_pattern(separator)

    local parsed_items = {}
    local current_lang = initial_lang -- sticky across items

    -- split on the separator in the escaped string so literal \: is preserved
    for chunk in (sub_escaped .. separator):gmatch(item_pattern) do
        local item_escaped = utilities.trim_string(chunk)
        if item_escaped ~= "" then
            -- an explicit "lang:" prefix overrides the sticky language from here on
            local override_lang, body_escaped = split_lang_prefix(item_escaped)
            if override_lang then current_lang = override_lang end

            -- unescape the body so "#..." suffixes and the display part can be parsed
            local item_str = utilities.trim_string(
                utilities.unescape_colons_after_split(body_escaped)
            )

            local homonym, sup
            item_str, homonym, sup = strip_hash_suffixes(item_str)
            item_str = utilities.trim_string(item_str)

            -- what remains is "etymon_item[:display_form]"
            local parts = utilities.split_string(item_str, ":")
            local etym_item = parts[1] or ""
            local display_form = parts[2]

            -- homonym written directly on the item ("item#N:display")
            local leftover_homonym = etym_item:match("#(.+)$")
            if leftover_homonym and not homonym then
                homonym = leftover_homonym
            end
            etym_item = (etym_item:gsub("#.+$", ""))

            table.insert(parsed_items, {
                lang_code = current_lang or "unknown",
                etymon_item = etym_item,
                homonym = homonym,
                display_form = display_form,
                sup = sup, -- keep item-level sup for rendering only
            })
        end
    end

    return {
        descendant_type = descendant_type,
        items = parsed_items,
        uncertainty = etymon_uncertainty
    }
end

-- Parse a full entry row into a table keyed by field name.
--
-- entry:     positional row; columns 1..7 are read as term, spelling,
--            gloss_en, gloss_es, reference, etymon, notes (missing columns
--            are treated as "").
-- lang_code: stored unchanged as `lang_code` on the result.
--
-- Every column is split on parameters.value_separator (default "\\"), each
-- piece is trimmed, empty pieces are dropped, and the rest go through the
-- matching parse_* function. `notes` pieces are wrapped as {raw = <text>}.
-- The result always holds an array (possibly empty) for each of the 7 fields.
function parser.parse_entry(entry, lang_code)
    -- column position -> {field name, name of the parser method (nil = keep raw)}
    local columns = {
        {"term", "parse_term"},
        {"spelling", "parse_spelling"},
        {"gloss_en", "parse_gloss"},
        {"gloss_es", "parse_gloss"},
        {"reference", "parse_reference"},
        {"etymon", "parse_etymon"},
        {"notes", nil},
    }

    local result = {lang_code = lang_code}

    for index = 1, #columns do
        local name, method = columns[index][1], columns[index][2]
        local cell = entry[index] or ""
        local separator = parameters.value_separator or "\\"
        local pieces = utilities.split_string(cell, separator)

        local bucket = {}
        result[name] = bucket

        for _, piece in ipairs(pieces) do
            local text = utilities.trim_string(piece)
            if text ~= "" then
                if method then
                    -- looked up on `parser` at call time, like the original chain
                    table.insert(bucket, parser[method](text))
                else
                    table.insert(bucket, { raw = text })
                end
            end
        end
    end

    -- Backward compatibility with the old single-gloss format: if nothing was
    -- collected for gloss_en but column 3 is non-empty, parse it as one gloss.
    -- NOTE(review): gloss_en is itself built from column 3 above, so this seems
    -- to fire only when every split piece trimmed to "" -- confirm the intent.
    if #result.gloss_en == 0 then
        local legacy_gloss = entry[3]
        if legacy_gloss and legacy_gloss ~= "" then
            table.insert(result.gloss_en, parser.parse_gloss(legacy_gloss))
        end
    end

    return result
end

-- Export the module table with its parse_* functions.
return parser