Actions

Module

Module:parser

From SUALEX

Revision as of 07:24, 29 January 2026 by Jawad (talk | contribs) (Created page with "-- Module:parser local utilities = require("Module:utilities") local parser = {} function parser.parse_term(value) if not value or value == "" then return {} end local parts = utilities.split_string(value, ":") local term_item = parts[1] or "" local display_form = parts[2] -- capture whatever follows # as homonym (string), e.g. "1" for "#1" local homonym = term_item:match("#(.+)$") term_item = term_item:gsub("#.+$", "") return {term_ite...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

Documentation for this module may be created at Module:parser/doc

-- Module:parser

local utilities = require("Module:utilities")

local parser = {}

--- Parse a term value shaped like "term#homonym:display".
-- The value is split on ":"; the first piece is the term (optionally carrying
-- a "#homonym" suffix) and the second piece, if any, is the display form.
-- @param value raw field text
-- @return table with term_item, homonym and display_form; {} for nil/empty input
function parser.parse_term(value)
    if (value or "") == "" then
        return {}
    end

    local pieces = utilities.split_string(value, ":")
    local head = pieces[1] or ""

    local result = {
        -- everything after the first "#" in the term piece, e.g. "1" for "#1"
        homonym = head:match("#(.+)$"),
        display_form = pieces[2],
    }
    -- parentheses keep only the string result of gsub (drop the match count)
    result.term_item = (head:gsub("#.+$", ""))
    return result
end

--- Parse a spelling value shaped like "spelling:orthography".
-- @param value raw field text
-- @return table with spelling_item and orthography; {} for nil/empty input
function parser.parse_spelling(value)
    if (value or "") == "" then
        return {}
    end

    local pieces = utilities.split_string(value, ":")
    local result = {}
    result.spelling_item = pieces[1]
    result.orthography = pieces[2]
    return result
end

--- Parse a gloss value, separating a trailing "#..." marker from the text.
-- Everything after the first "#" becomes `sup` (e.g. "?" for "#?"); the text
-- before it becomes `gloss_item`.
-- @param value raw field text
-- @return table with gloss_item and sup; {} for nil/empty input
function parser.parse_gloss(value)
    if (value or "") == "" then
        return {}
    end

    local marker = value:match("#(.+)$")
    -- a single local receives only the string result of gsub, not the count
    local text = value:gsub("#.+$", "")
    return {gloss_item = text, sup = marker}
end

--- Parse a reference value shaped like "author:year:page".
-- @param value raw field text
-- @return table with author, year and page; {} for nil/empty input
function parser.parse_reference(value)
    if (value or "") == "" then
        return {}
    end

    local pieces = utilities.split_string(value, ":")
    local reference = {}
    reference.author = pieces[1]
    reference.year = pieces[2]
    reference.page = pieces[3]
    return reference
end

-- Split an optional leading "lang:" token (lowercase letters and hyphens) off
-- `text`. Returns the language code (nil when absent) and the remaining text.
local function split_lang_prefix(text)
    local lang = text:match("^([a-z%-]+):")
    if not lang then
        return nil, text
    end
    -- the match is anchored at position 1, so the prefix is #lang bytes + ":"
    return lang, text:sub(#lang + 2)
end

-- Peel up to two trailing "#..." markers off `text`.
-- Returns the remaining text, the homonym (or nil) and the sup (or nil).
-- A marker may not contain ":" so that a homonym written before a display
-- form ("foo#1:Foo") is not mistaken for a trailing marker "1:Foo"; that
-- homonym is picked up later from the etymon piece itself.
local function split_hash_suffixes(text)
    local last = text:match("#([^#:]+)$")
    if not last then
        return text, nil, nil
    end

    local before_last = text:sub(1, -(#last) - 2) -- drop "#" .. last
    local second = before_last:match("#([^#:]+)$")
    if second then
        -- two markers: the first is the homonym, the last is the sup
        return before_last:sub(1, -(#second) - 2), second, last
    end
    if last == "?" then
        -- a lone "#?" is a sup (uncertainty marker), not a homonym
        return before_last, nil, last
    end
    return before_last, last, nil
end

--- Parse an etymon specification.
-- The value is "<descendant_type>:<rest>". A trailing "?" or "#?" on the
-- descendant type sets the top-level `uncertainty` flag and is removed.
-- <rest> may begin with a "lang:" token (returned as `initial_lang`), followed
-- by one or more items separated by "+". Each item may carry its own "lang:"
-- prefix, which then also applies to the following items; when no language
-- has been seen, "unknown" is used. Within an item, the text is split on ":"
-- into the etymon and an optional display form, and "#..." markers supply the
-- homonym and/or sup.
-- @param value raw field text
-- @return {} for nil/empty input or input without ":"; otherwise a table with
--   descendant_type, initial_lang, uncertainty (boolean) and items (a list of
--   tables with lang_code, etymon_item, homonym, display_form, sup and, when
--   sup is "?", uncertainty = true)
function parser.parse_etymon(value)
    if not value or value == "" then return {} end
    value = utilities.trim_string(value)

    local colon_pos = value:find(":", 1, true)
    if not colon_pos then return {} end

    local descendant_type = utilities.trim_string(value:sub(1, colon_pos - 1))
    local global_uncertainty = descendant_type:match("#?%?$") ~= nil
    descendant_type = descendant_type:gsub("#?%?$", "")

    -- the initial language token is looked for before trimming, so it is only
    -- recognised when it immediately follows the first ":"
    local initial_lang, sub = split_lang_prefix(value:sub(colon_pos + 1))
    sub = utilities.trim_string(sub)

    local parsed_items = {}
    local current_lang = initial_lang

    -- split on "+" (appending one guarantees the last item is terminated);
    -- order is preserved and blank items are skipped
    for chunk in (sub .. "+"):gmatch("(.-)%+") do
        local item_str = utilities.trim_string(chunk)
        if item_str ~= "" then
            -- an item-level "lang:" overrides the language carried so far
            local lang, rest = split_lang_prefix(item_str)
            if lang then
                current_lang = lang
                item_str = rest
            end
            current_lang = current_lang or "unknown"

            local homonym, sup
            item_str, homonym, sup = split_hash_suffixes(item_str)
            item_str = utilities.trim_string(item_str)

            local parts = utilities.split_string(item_str, ":")
            local etym_item = parts[1] or ""
            local display_form = parts[2]

            -- homonym written directly on the etymon, before ":display"
            local leftover_homonym = etym_item:match("#(.+)$")
            if leftover_homonym and not homonym then
                homonym = leftover_homonym
            end
            etym_item = etym_item:gsub("#.+$", "")

            local parsed_item = {
                lang_code = current_lang,
                etymon_item = etym_item,
                homonym = homonym,
                display_form = display_form,
                sup = sup, -- trailing marker content (string), e.g. "?"
            }
            if sup == "?" then
                parsed_item.uncertainty = true
            end

            parsed_items[#parsed_items + 1] = parsed_item
        end
    end

    return {
        descendant_type = descendant_type,
        initial_lang = initial_lang,
        uncertainty = global_uncertainty,
        items = parsed_items
    }
end

--- Dispatch a raw field value to the parser registered for `field_name`.
-- Field names without a dedicated parser yield {raw = value}.
-- @param field_name one of "term", "spelling", "gloss", "reference", "etymon",
--   or any other name (handled by the raw fallback)
-- @param value raw field text
-- @return the parser's result table, {raw = value}, or {} for nil/empty input
function parser.parse_field(field_name, value)
    if (value or "") == "" then
        return {}
    end

    local dispatch = {
        term = parser.parse_term,
        spelling = parser.parse_spelling,
        gloss = parser.parse_gloss,
        reference = parser.parse_reference,
        etymon = parser.parse_etymon,
    }

    local handler = dispatch[field_name]
    if handler then
        local result = handler(value)
        -- a falsy parser result also falls through to the raw fallback
        if result then
            return result
        end
    end
    return {raw = value}
end

--- Parse a positional entry into a table keyed by field name.
-- entry[1]..entry[6] are read as term, spelling, gloss, reference, etymon and
-- notes respectively. Each raw field is split on the value separator taken
-- from Module:parameters (a single backslash when that module defines none),
-- and every trimmed piece is parsed with parser.parse_field.
-- @param entry table indexed 1..6 with raw field strings (missing slots are
--   treated as "")
-- @param lang_code stored unchanged as `lang_code` in the result
-- @return table with lang_code plus one list of parsed values per field name
function parser.parse_entry(entry, lang_code)
    local fields = {"term", "spelling", "gloss", "reference", "etymon", "notes"}
    -- the separator does not depend on the field, so look it up once
    local sep = require("Module:parameters").value_separator or "\\"

    local parsed = {lang_code = lang_code}
    for i, field in ipairs(fields) do
        local raw = entry[i] or ""
        local parsed_values = {}
        for _, val in ipairs(utilities.split_string(raw, sep)) do
            parsed_values[#parsed_values + 1] =
                parser.parse_field(field, utilities.trim_string(val))
        end
        parsed[field] = parsed_values
    end
    return parsed
end

return parser