android13/external/cldr/common/transforms/si-si_FONIPA.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!-- Copyright © 1991-2015 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see http://www.unicode.org/copyright.html -->
<supplementalData>
	<version number="$Revision$" />
	<transforms>
		<transform source="si" target="si_FONIPA" direction="forward" alias="si-fonipa-t-si">
			<tRule><![CDATA[
# Sinhala pronunciation rules
#
# Output
#     k ɡ ŋ ᵑɡ c ɟ ɲ ʈ ɖ ⁿɖ t d n ⁿd p b m ᵐb j r l w ʃ s z h f
#     ə əː a aː æ æː i iː u uː e eː o oː
#
# References
# [1] Asanka Wasala, Ruvan Weerasinghe, and Kumudu Gamage:
#     Sinhala Grapheme-to-Phoneme Conversion and Rules for Schwa Epenthesis.
#     Proceedings of the COLING/ACL 2006 Main Conference Poster Sessions,
#     pages 890–897. http://www.aclweb.org/anthology/P06-2114

# Simplify ya + yansaya to plain ya when it follows a consonant + virama
# (optionally with a ZWJ after the virama).
[\u0D9A-\u0DC6] \u0DCA (\u200D)? { ය්‍ය → ය;

# Delete ZWNJ (U+200C) and ZWJ (U+200D) to simplify further processing:
# none of the later rules mention either joiner, so they are removed up front.
\u200C → ;
\u200D → ;

# Insert a schwa after every consonant that is not followed by a dependent vowel
# or virama.
# "::Null;" closes the preceding group of rules, so this rule runs as a separate
# pass over text from which the joiners have already been deleted.
# [\u0D9A-\u0DC6] is the consonant range. The negated set after "}" excludes the
# virama and vowel signs U+0DCA..U+0DDF plus U+0DF2 and U+0DF3. Whatever follows
# "}" is context only: it is tested but not replaced.
::Null;
([\u0D9A-\u0DC6]) } [^\u0DCA-\u0DDF \u0DF2\u0DF3] → $1 ə;

# Pronunciation rules proper.
# Everything from here up to the next "::Null;" forms one pass; within a pass
# the first rule that matches at the current position is the one applied.
::Null;

# fප is an alternative spelling of ෆ.
# This occurs e.g. in ඩේවිඩ් කොපර්fපීල්ඩ් (David Copperfield)
# [see http://bradshawofthefuture.blogspot.com/2013/02/f.html].
# Listed ahead of the plain ප rule so that the Latin letter and ප are consumed
# together. Both upper- and lower-case F are accepted.
[Ff]ප → f;

# zස is seemingly the only way to unambiguously indicate a voiced /z/ sound.
# This occurs in e.g. ඇල්zසයිම' රෝගය (Alzheimer's disease)
# [see https://si.wikipedia.org/wiki/ඇල්zසයිම%27_රෝගය]
# or in zසීබ්‍රා (zebra) [see https://si.wikipedia.org/wiki/‍zසීබ්‍රා].
# Listed ahead of the plain ස rule for the same reason as above. Both upper-
# and lower-case Z are accepted.
[Zz]ස → z;

# Anusvaraya becomes ŋ.
# NOTE(review): the second rule has no context, so it rewrites every occurrence
# of that letter in the input, not only those standing in for anusvaraya —
# confirm this is acceptable.
ං → ŋ;
o → ŋ;  # common substitution for anusvaraya

# Visargaya: before a consonant it is rewritten as that consonant + virama
# followed by the consonant again, i.e. the consonant is doubled. The "|" leaves
# the cursor in front of the rewritten text so that the consonant and virama
# rules of this pass still apply to it. In any other position it becomes h.
ඃ ([\u0D9A-\u0DC6]) → | $1 \u0DCA $1;  # TODO: check which consonants geminate
ඃ → h;

# Independent vowel letters. Long vowels are written with the length mark ː.
අ → a;
ආ → aː;
ඇ → æ;
ඈ → æː;
ඉ → i;
ඊ → iː;
උ → u;
ඌ → uː;
# NOTE(review): this letter gives "ri", whereas the next rule and the
# vowel-sign rules further down that involve r give "ruː", "ru" and "ruː" —
# confirm that the i here is intended.
ඍ → ri;
ඎ → ruː;
ඏ → ilu;
ඐ → iluː;
එ → e;
ඒ → eː;
ඓ → aj;
ඔ → o;
ඕ → oː;
ඖ → aw;  # TODO: check if this is correct

# Consonant letters. Several letters share one output symbol (for example the
# first two rules both give k), so some distinctions made in the spelling are
# not carried over into the transcription.
ක → k;
ඛ → k;
ග → ɡ;
ඝ → ɡ;
ඞ → ŋ;
ඟ → ᵑɡ;
ච → c;
ඡ → c;
ජ → ɟ;
ඣ → ɟ;
ඤ → ɲ;
ඥ → kɲ;  # TODO: double-check
# NOTE(review): this letter maps to plain ɟ, while ඟ, ඬ, ඳ and ඹ map to
# symbols with a superscript nasal (ᵑɡ, ⁿɖ, ⁿd, ᵐb) — confirm that no nasal
# onset is wanted here.
ඦ → ɟ;
ට → ʈ;
ඨ → ʈ;
ඩ → ɖ;
ඪ → ɖ;
ණ → n;
ඬ → ⁿɖ;
ත → t;
ථ → t;
ද → d;
ධ → d;
න → n;
ඳ → ⁿd;
ප → p;
ඵ → p;
බ → b;
භ → b;
ම → m;
ඹ → ᵐb;
ය → j;
ර → r;
ල → l;
ව → w;
ශ → ʃ;
ෂ → ʃ;
ස → s;
හ → h;
ළ → l;
ෆ → f;

# A consonant followed by virama received no schwa in the earlier pass, so
# dropping the virama here leaves a bare consonant.
\u0DCA → ;  # delete virama

# Dependent vowel signs. A consonant followed by one of these received no
# schwa either, so the sign's output supplies the vowel.
# U+0DD2, U+0DD3, U+0DD4 and U+0DD6 are given as escapes; the remaining signs
# are written literally.
ා → aː;
ැ → æ;
ෑ → æː;
\u0DD2 → i;
\u0DD3 → iː;
\u0DD4 → u;
\u0DD6 → uː;
ෘ → ru;
ෙ → e;
ේ → eː;
ෛ → aj;
ො → o;
ෝ → oː;
ෞ → aw;  # TODO: check if this is correct
ෟ → lu;
ෲ → ruː;
ෳ → luː;

# Heuristics for turning /ə/ into /a/. Based on [1].

# $c: the consonant symbols produced by the rules above. Entries in braces are
# multi-character strings, each matched as one unit.
$c=[k ɡ ŋ {ᵑɡ} c ɟ ɲ ʈ ɖ {ⁿɖ} t d n {ⁿd} p b m {ᵐb} j r l w ʃ s z h f];

# $s: any character that is not a letter; the rules below use it to test for
# the start or end of a word.
$s=[:^L:];

# Rule #1
# Handles the first ə of a word, i.e. the one directly after a word-initial
# consonant or pair of consonants: by default it becomes a. The exceptions are
# listed first because the earliest matching rule wins; rewriting ə to itself
# steps over it and so shields it from the two general rules.
# Exception (a) is spelled with w rather than v: ව is transcribed as w by the
# consonant rules and $c contains no v, so the cluster that [1] writes as /sv/
# reaches this pass as "sw". Spelled with v the exception could never match.
::Null;
$s sw    { ə      → ə;  # exception (a): after word-initial sw
$s k     { ə } r  → ə;  # exception (b): word-initial kər
$s $c    { ə } $s → ə;  # exception (c): the word is just consonant + ə
$s $c $c { ə      → a;
$s $c    { ə      → a;

# Rule #2
# Deals with the vowel in consonant + r + vowel + consonant. The three rules
# share one pass and are tried in order at each position:
#   - ə becomes a;
#   - a before h is kept (matching it here stops the next rule from firing);
#   - a before any other consonant becomes ə.
# After a rule fires the cursor moves past its output, so an a written by the
# first rule is not turned back into ə by the third.
::Null;
$c r { ə } $c → a;  # clause (a) and (b)
$c r { a } h  → a;  # clause (d), exception
$c r { a } $c → ə;  # clause (c)

# Rule #3
# The paper is unclear about what this rule means. The interpretation here
# assumes that "preceded" in the paper is a typo and should be read "followed".
# As implemented: ə becomes a when it comes directly after h and that h itself
# comes directly after one of a, e, æ, o or ə.
::Null;
[a e æ o ə] h { ə → a;

# Rules #4 through #7
# All five rules share one pass, so at each ə the first rule whose context
# matches decides the result.
#   #4: ə before two consonants becomes a.
#   #5: ə before a single consonant that ends the word becomes a, unless that
#       consonant is r, b, ɖ or ʈ, in which case the exception keeps ə.
#   #6: ə before a word-final ji becomes a.
#   #7: ə between k and ru or lu becomes a.
# "Ends the word" / "word-final" means: followed by a non-letter ($s).
::Null;
    ə } $c $c     → a;  # Rule #4
    ə } [rbɖʈ] $s → ə;  # Rule #5 exception
    ə } $c     $s → a;  # Rule #5
    ə } ji     $s → a;  # Rule #6
k { ə } [rl] u    → a;  # Rule #7

# Rule #8
# Note that the paper doesn't say explicitly that this rule should be
# anchored at the beginning of a word, but the remarks before the rules
# seem to imply this.
# Every rule below therefore applies only right after a word-initial k:
#   - a before l + long a, e or o + j becomes ə;
#   - a before le + m or h + u or i becomes ə;
#   - alə before h + u or i becomes əle;
#   - a before lə becomes ə.
::Null;
$s k { a } l[aeo]ːj   → ə;  # Typo in paper: /j/ was /y/.
$s k { a } le[mh][ui] → ə;
$s k { alə } h[ui]    → əle;
$s k { a } lə         → ə;

# Diphthongs
# w-glides: a run of three or more w is cut down to two, and "wu" directly
# after one of the listed vowels is shortened to w.
::Null;
www+ → ww;  # යෞව්වන
[i {iː} e {eː} æ {æː} o {oː} a {aː}] { wu → w;

# j-glides: əji becomes aj, iji becomes iː, and "ji" directly after one of the
# listed vowels is shortened to j.
əji → aj;
iji → iː;  # perhaps: ij
[u {uː} e {eː} æ {æː} o {oː} a {aː}] { ji → j;
			]]></tRule>
		</transform>
	</transforms>
</supplementalData>