Module:Sandbox/DarmaniLink
require('strict');
local utf8 = require("Module:Unicode data")
-- Converts romaji/kana to modified Hepburn; I recommend subst:ing the result.
--
-- Standard long vowel patterns, stored as an ordered list of
-- { target, replacement } pairs. The order matters: some targets overlap
-- (for example "oo" and "ou" both match inside "oou"), so the list is walked
-- front to back to give the same result on every run. A keyed table walked
-- with pairs() would leave the order unspecified.
local diacritics = {
    { "aa", "ā" },
    { "uu", "ū" },
    { "ee", "ē" },
    { "oo", "ō" },
    { "ou", "ō" },
    { "Aa", "Ā" },
    { "Uu", "Ū" },
    { "Ee", "Ē" },
    { "Oo", "Ō" },
    { "Ou", "Ō" },
    { "AA", "Ā" },
    { "UU", "Ū" },
    { "EE", "Ē" },
    { "OO", "Ō" },
    { "OU", "Ō" }
}

--- Replaces doubled-vowel spellings with their macron equivalents.
-- Every pattern in `diacritics` is applied to the whole string, in list order.
-- @param romanji string of romaji text (must not be nil)
-- @return the string with each listed vowel pair replaced by a macron vowel
local function romanjiToHepburn(romanji)
    for _, pair in ipairs(diacritics) do
        -- gsub also returns a match count; the assignment keeps only the string
        romanji = romanji:gsub(pair[1], pair[2])
    end
    return romanji
end
--- Joins the romanization of a small kana onto the romaji built so far.
-- A small kana modifies the syllable before it, so the trailing vowel of that
-- syllable is dropped first (き + ゃ => "ky" .. "ya"). If the text does not end
-- in an ASCII vowel (it is empty, or ends in a consonant or a multi-byte
-- character), nothing is dropped and the small kana is simply appended, so no
-- partial character is ever cut off.
-- @param romanji string of romaji accumulated so far
-- @param small romanization of the small kana, e.g. "ya" or "a"
-- @return the combined string
local function appendSmallKana(romanji, small)
    if not romanji:find("[aeiou]$") then
        return romanji .. small
    end
    local stem = romanji:sub(1, -2) -- previous syllable without its vowel
    local cluster = stem:sub(-2)
    -- Syllables written with sh/ch/ts/dz/j take only the vowel of the small
    -- kana: しゅ => shu (not shyu), じゃ => ja (not jya).
    if cluster == "sh" or cluster == "ch" or cluster == "ts" or cluster == "dz"
        or stem:sub(-1) == "j" then
        return stem .. small:sub(-1)
    end
    -- this will cause issues if someone tries something like あぁぁぁぁぁ => a
    return stem .. small
end

--- Converts a hiragana string to Hepburn romaji with macrons.
-- Characters found in neither lookup table (ASCII, っ, anything else) are
-- copied through unchanged; っ is then resolved in a second pass.
-- map is made local so it wont get cached every single time this is ran
-- TODO: add a flag to disable the macron step, and return the plain romaji
-- @param kana string to convert; only hiragana is looked up
-- @return romaji string after romanjiToHepburn has been applied
local function kanaToHepburn(kana)
    local romanji = ""
    --TODO split map up into consonant groups and create a jump table based off the unicode value
    local kanaMap = {
        ["あ"] = "a", ["い"] = "i", ["う"] = "u", ["え"] = "e", ["お"] = "o",
        ["か"] = "ka", ["き"] = "ki", ["く"] = "ku", ["け"] = "ke", ["こ"] = "ko",
        ["さ"] = "sa", ["し"] = "shi", ["す"] = "su", ["せ"] = "se", ["そ"] = "so",
        ["た"] = "ta", ["ち"] = "chi", ["つ"] = "tsu", ["て"] = "te", ["と"] = "to",
        ["な"] = "na", ["に"] = "ni", ["ぬ"] = "nu", ["ね"] = "ne", ["の"] = "no",
        ["は"] = "ha", ["ひ"] = "hi", ["ふ"] = "fu", ["へ"] = "he", ["ほ"] = "ho",
        ["ま"] = "ma", ["み"] = "mi", ["む"] = "mu", ["め"] = "me", ["も"] = "mo",
        ["や"] = "ya", ["ゆ"] = "yu", ["よ"] = "yo",
        ["ら"] = "ra", ["り"] = "ri", ["る"] = "ru", ["れ"] = "re", ["ろ"] = "ro",
        ["わ"] = "wa", ["ゐ"] = "wi", ["ゑ"] = "we", ["を"] = "wo",
        ["ん"] = "n",
        ["が"] = "ga", ["ぎ"] = "gi", ["ぐ"] = "gu", ["げ"] = "ge", ["ご"] = "go",
        ["ざ"] = "za", ["じ"] = "ji", ["ず"] = "zu", ["ぜ"] = "ze", ["ぞ"] = "zo",
        ["だ"] = "da", ["ぢ"] = "ji", ["づ"] = "dzu", ["で"] = "de", ["ど"] = "do",
        ["ば"] = "ba", ["び"] = "bi", ["ぶ"] = "bu", ["べ"] = "be", ["ぼ"] = "bo",
        ["ぱ"] = "pa", ["ぴ"] = "pi", ["ぷ"] = "pu", ["ぺ"] = "pe", ["ぽ"] = "po",
        ["ゔ"] = "vu"
    }
    -- small kana combine with the syllable before them (see appendSmallKana)
    local smallKanaMap = {
        ["ぁ"] = "a", ["ぃ"] = "i", ["ぅ"] = "u", ["ぇ"] = "e", ["ぉ"] = "o",
        ["ゕ"] = "ka", ["ゖ"] = "ke",
        ["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo"
    }

    -- First pass: romanize one character at a time, without diacritics.
    for codepoint in mw.ustring.gcodepoint(kana) do
        local char = mw.ustring.char(codepoint)
        local romanization = kanaMap[char]
        local smallRomanization = smallKanaMap[char]
        if romanization then
            romanji = romanji .. romanization
        elseif smallRomanization then
            romanji = appendSmallKana(romanji, smallRomanization)
        else
            -- character was not in either map, append it directly
            romanji = romanji .. char
        end
    end

    -- Second pass: replace each "っ" with a copy of the character after it.
    -- Every replacement swaps one character for at most one character, so the
    -- positions still to be visited never move.
    for i = 1, mw.ustring.len(romanji) do
        if mw.ustring.sub(romanji, i, i) == "っ" then
            local nextChar = mw.ustring.sub(romanji, i + 1, i + 1)
            -- A following vowel is not doubled; the っ is left in place.
            -- At the very end of the string nextChar is "", which drops the っ.
            if not nextChar:match("[aeiou]") then
                local doubled = nextChar
                if nextChar == "c" then
                    doubled = "t" -- Hepburn writes っち as "tchi", not "cchi"
                end
                romanji = mw.ustring.sub(romanji, 1, i - 1) .. doubled .. mw.ustring.sub(romanji, i + 1)
            end
        end
    end

    -- kana is converted to romaji, now change it to hepburn
    return romanjiToHepburn(romanji)
end
-- checking for kana will need to check these bounds regardless
-- might as well convert at the same time
-- would it be better to have the kana conversion in the above function?

--- Detects kana in a string and rewrites katakana as hiragana.
-- Each code point between "ァ" and "ヶ" is shifted down by the distance
-- between "ァ" and "ぁ", so one hiragana lookup table can serve both scripts.
-- Every other code point is copied through unchanged.
-- @param data string to scan (must not be nil)
-- @return a two-element table: [1] true if any hiragana or katakana in the
--         checked ranges was seen, [2] the string with katakana shifted
local function checkForKanaPresentAndConvert(data)
    local hiraganaLowerBound = mw.ustring.codepoint("ぁ")
    local hiraganaUpperBound = mw.ustring.codepoint("ゖ")
    local katakanaLowerBound = mw.ustring.codepoint("ァ")
    local katakanaUpperBound = mw.ustring.codepoint("ヶ")
    local kanaDelta = katakanaLowerBound - hiraganaLowerBound -- difference in the unicode table

    local kanaFound = false
    -- Collect the characters in a buffer and join once at the end; appending
    -- to a string inside the loop re-copies the whole string on every step.
    local pieces = {}
    for c in mw.ustring.gcodepoint(data) do
        -- ASCII (<= 127) is the intended use and needs no range checks;
        -- kana support was intended to be a minor feature
        if c > 127 then
            if hiraganaLowerBound <= c and c <= hiraganaUpperBound then
                kanaFound = true
            elseif katakanaLowerBound <= c and c <= katakanaUpperBound then
                kanaFound = true
                -- convert to hiragana codepointwise so the lookup table
                -- does not have to be remade for katakana
                c = c - kanaDelta
            end
        end
        pieces[#pieces + 1] = mw.ustring.char(c)
    end
    return { kanaFound, table.concat(pieces) }
end
--- Converts kana or romaji input to Hepburn, choosing the path by content.
-- @param data string to convert; nil/false yields no return value
-- @return the converted string, or nothing when no data was given
local function toHepburnKana(data)
    if data then
        local scan = checkForKanaPresentAndConvert(data)
        local hasKana, hiraganaText = scan[1], scan[2]
        if hasKana then
            -- kana present: romanize the katakana-normalized copy
            return kanaToHepburn(hiraganaText)
        end
        -- kana not found, so we were probably given romaji; other unicode
        -- simply passes through the vowel replacement
        return romanjiToHepburn(data)
    end
    -- no data: fall through and return nothing (short circuit)
end
local p = {}
-- Behaviour switches read by p.toHepburn:
--   flags["romanji"]  treat the input as romaji and skip kana detection
--   flags["name"]     capitalise the first letter of each word
-- NOTE(review): nothing in this module sets these yet, so both are off.
local flags = {}
--TODO add a performant way to detect if there is kana in a string
--this could be expanded to use bopomofo too

--- Entry point for #invoke: converts the first positional argument to Hepburn.
-- @param frame Scribunto frame object; frame.args[1] is the text to convert
-- @return the converted string, or nil when no text was supplied
function p.toHepburn(frame)
    local data = frame.args[1]
    local romanji
    if data and flags["romanji"] then
        romanji = romanjiToHepburn(data)
    else
        -- toHepburnKana returns nothing for missing input, leaving romanji nil
        romanji = toHepburnKana(data)
    end
    if romanji and flags["name"] then
        -- Use the Unicode-aware string library here: with the byte-oriented
        -- one a macron vowel is not a letter, so it would end the word early
        -- and "tōkyō" would come out as "TōKyō".
        romanji = mw.ustring.gsub(romanji, "(%a)([%w_']*)", function(first, rest)
            return mw.ustring.upper(first) .. mw.ustring.lower(rest)
        end)
    end
    return romanji
end
-- Testing helper for the Lua console on the module page itself:
-- returns the second positional argument of the frame unchanged.
function p.toHepburnTEST(frame)
    local args = frame.args
    return args[2]
end
return p
Content Disclaimer
Informasi ini disarikan dari Wikipedia dan disajikan kembali untuk tujuan edukasi. Konten tersedia di bawah lisensi CC BY-SA 3.0. Kami tidak bertanggung jawab atas ketidakakuratan data yang bersumber dari kontribusi publik tersebut.
- The information displayed on this website is sourced in part or in whole from Wikipedia and has been adapted and restated here. We strive to provide accurate and relevant information; however:
- There is no guarantee of absolute accuracy. Wikipedia is an open, collaborative project that can be edited by anyone, so information is subject to change.
- It is not intended to constitute professional advice. The content displayed is for informational and educational purposes only. For important decisions (e.g., medical, legal, or financial), please consult a professional.
- Content copyright. Wikipedia is licensed under the Creative Commons Attribution-ShareAlike License (CC BY-SA). This means that content may be reused with appropriate attribution and shared under a similar license.
- Responsible use. Any risk arising from the use of information from this website is entirely the responsibility of the user.