192 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			XML
		
	
	
	
			
		
		
	
	
			192 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			XML
		
	
	
	
| <?xml version="1.0" encoding="UTF-8" ?>
 | ||
| <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
 | ||
| <!-- Copyright © 1991-2015 Unicode, Inc.
 | ||
| CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
 | ||
| For terms of use, see http://www.unicode.org/copyright.html -->
 | ||
| <supplementalData>
 | ||
| 	<version number="$Revision$" />
 | ||
| 	<transforms>
 | ||
| 		<transform source="si" target="si_FONIPA" direction="forward" alias="si-fonipa-t-si">
 | ||
| 			<tRule><![CDATA[
 | ||
| # Sinhala pronunciation rules
 | ||
| #
 | ||
| # Output
 | ||
| #     k ɡ ŋ ᵑɡ c ɟ ɲ ʈ ɖ ⁿɖ t d n ⁿd p b m ᵐb j r l w ʃ s z h f
 | ||
| #     ə əː a aː æ æː i iː u uː e eː o oː
 | ||
| #
 | ||
| # References
 | ||
| # [1] Asanka Wasala, Ruvan Weerasinghe, and Kumudu Gamage:
 | ||
| #     Sinhala Grapheme-to-Phoneme Conversion and Rules for Schwa Epenthesis.
 | ||
| #     Proceedings of the COLING/ACL 2006 Main Conference Poster Sessions,
 | ||
| #     pages 890–897. http://www.aclweb.org/anthology/P06-2114
 | ||
| 
 | ||
| # Simplify ya + yansaya to plain ya after a consonant.
 | ||
| [\u0D9A-\u0DC6] \u0DCA (\u200D)? { ය්ය → ය;  # left context: consonant, virama (U+0DCA), optional ZWJ
 | ||
| 
 | ||
| # Delete ZWNJ and ZWJ to simplify further processing.
 | ||
| \u200C → ;  # ZWNJ
 | ||
| \u200D → ;  # ZWJ
 | ||
| 
 | ||
| # Insert a schwa after every consonant that is not followed by a dependent vowel
 | ||
| # or virama.
 | ||
| ::Null;  # pass boundary: the rule below runs on the output of the rules above
 | ||
| ([\u0D9A-\u0DC6]) } [^\u0DCA-\u0DDF \u0DF2\u0DF3] → $1 ə;  # $1 is the matched consonant; the following character is context only
 | ||
| 
 | ||
| # Pronunciation rules proper.
 | ||
| ::Null;  # pass boundary: from here on every consonant without a vowel sign or virama already carries ə
 | ||
| 
 | ||
| # fප is an alternative spelling of ෆ.
 | ||
| # This occurs e.g. in ඩේවිඩ් කොපර්fපීල්ඩ් (David Copperfield)
 | ||
| # [see http://bradshawofthefuture.blogspot.com/2013/02/f.html].
 | ||
| [Ff]ප → f;  # the letter pair collapses to a single f
 | ||
| 
 | ||
| # zස is seemingly the only way to unambiguously indicate a voiced /z/ sound.
 | ||
| # This occurs in e.g. ඇල්zසයිම' රෝගය (Alzheimer's disease)
 | ||
| # [see https://si.wikipedia.org/wiki/ඇල්zසයිම%27_රෝගය]
 | ||
| # or in zසීබ්රා (zebra) [see https://si.wikipedia.org/wiki/zසීබ්රා].
 | ||
| [Zz]ස → z;  # the letter pair collapses to a single z
 | ||
| 
 | ||
| ං → ŋ;  # anusvaraya
 | ||
| o → ŋ;  # common substitution for anusvaraya
 | ||
| 
 | ||
| ඃ ([\u0D9A-\u0DC6]) → | $1 \u0DCA $1;  # visargaya before a consonant: emit consonant + virama + consonant; "|" leaves the cursor before the output so this pass re-reads it. TODO: check which consonants geminate
 | ||
| ඃ → h;  # visargaya anywhere else
 | ||
| 
 | ||
| අ → a;  # independent vowel letters start here (through ඖ)
 | ||
| ආ → aː;
 | ||
| ඇ → æ;
 | ||
| ඈ → æː;
 | ||
| ඉ → i;
 | ||
| ඊ → iː;
 | ||
| උ → u;
 | ||
| ඌ → uː;
 | ||
| ඍ → ri;
 | ||
| ඎ → ruː;
 | ||
| ඏ → ilu;
 | ||
| ඐ → iluː;
 | ||
| එ → e;
 | ||
| ඒ → eː;
 | ||
| ඓ → aj;
 | ||
| ඔ → o;
 | ||
| ඕ → oː;
 | ||
| ඖ → aw;  # TODO: check if this is correct
 | ||
| 
 | ||
| ක → k;  # consonant letters start here (through ෆ); several letters share one output, e.g. ක and ඛ both give k
 | ||
| ඛ → k;
 | ||
| ග → ɡ;
 | ||
| ඝ → ɡ;
 | ||
| ඞ → ŋ;
 | ||
| ඟ → ᵑɡ;
 | ||
| ච → c;
 | ||
| ඡ → c;
 | ||
| ජ → ɟ;
 | ||
| ඣ → ɟ;
 | ||
| ඤ → ɲ;
 | ||
| ඥ → kɲ;  # TODO: double-check
 | ||
| ඦ → ɟ;  # NOTE(review): ඟ, ඬ, ඳ and ඹ get a prenasalized output but this letter gets plain ɟ; confirm that is intended
 | ||
| ට → ʈ;
 | ||
| ඨ → ʈ;
 | ||
| ඩ → ɖ;
 | ||
| ඪ → ɖ;
 | ||
| ණ → n;
 | ||
| ඬ → ⁿɖ;
 | ||
| ත → t;
 | ||
| ථ → t;
 | ||
| ද → d;
 | ||
| ධ → d;
 | ||
| න → n;
 | ||
| ඳ → ⁿd;
 | ||
| ප → p;
 | ||
| ඵ → p;
 | ||
| බ → b;
 | ||
| භ → b;
 | ||
| ම → m;
 | ||
| ඹ → ᵐb;
 | ||
| ය → j;
 | ||
| ර → r;
 | ||
| ල → l;
 | ||
| ව → w;
 | ||
| ශ → ʃ;
 | ||
| ෂ → ʃ;
 | ||
| ස → s;
 | ||
| හ → h;
 | ||
| ළ → l;
 | ||
| ෆ → f;
 | ||
| 
 | ||
| \u0DCA → ;  # delete virama
 | ||
| 
 | ||
| ා → aː;  # dependent vowel signs start here (through ෳ)
 | ||
| ැ → æ;
 | ||
| ෑ → æː;
 | ||
| \u0DD2 → i;
 | ||
| \u0DD3 → iː;
 | ||
| \u0DD4 → u;
 | ||
| \u0DD6 → uː;
 | ||
| ෘ → ru;  # NOTE(review): the independent letter ඍ maps to "ri" earlier in this file; confirm "ru" is intended here
 | ||
| ෙ → e;
 | ||
| ේ → eː;
 | ||
| ෛ → aj;
 | ||
| ො → o;
 | ||
| ෝ → oː;
 | ||
| ෞ → aw;  # TODO: check if this is correct
 | ||
| ෟ → lu;
 | ||
| ෲ → ruː;
 | ||
| ෳ → luː;
 | ||
| 
 | ||
| # Heuristics for turning /ə/ into /a/. Based on [1].
 | ||
| 
 | ||
| $c=[k ɡ ŋ {ᵑɡ} c ɟ ɲ ʈ ɖ {ⁿɖ} t d n {ⁿd} p b m {ᵐb} j r l w ʃ s z h f];  # consonant phones; {…} makes a multi-character phone a single set member
 | ||
| 
 | ||
| $s=[:^L:];  # any character that is not a letter (General_Category L)
 | ||
| 
 | ||
| # Rule #1: ə after one or two consonants that follow a non-letter becomes a.
 | ||
| ::Null;
 | ||
| $s sv    { ə      → ə;  # exception (a); mapping ə to itself consumes the match so the general rules below are not tried
 | ||
| $s k     { ə } r  → ə;  # exception (b)
 | ||
| $s $c    { ə } $s → ə;  # exception (c)
 | ||
| $s $c $c { ə      → a;  # general case, two consonants
 | ||
| $s $c    { ə      → a;  # general case, one consonant
 | ||
| 
 | ||
| # Rule #2: the vowel between consonant + r and a following consonant.
 | ||
| ::Null;
 | ||
| $c r { ə } $c → a;  # clause (a) and (b)
 | ||
| $c r { a } h  → a;  # clause (d), exception: a is kept before h (listed first so the next rule is not tried)
 | ||
| $c r { a } $c → ə;  # clause (c)
 | ||
| 
 | ||
| # Rule #3
 | ||
| # The paper is unclear about what this rule means. The interpretation here
 | ||
| # assumes that "preceded" in the paper is a typo and should be read "followed".
 | ||
| ::Null;
 | ||
| [a e æ o ə] h { ə → a;  # ə becomes a when the left context is one of a, e, æ, o, ə followed by h
 | ||
| 
 | ||
| # Rules #4 through #7
 | ||
| ::Null;
 | ||
|     ə } $c $c     → a;  # Rule #4: ə before two consonants
 | ||
|     ə } [rbɖʈ] $s → ə;  # Rule #5 exception: ə is kept before r, b, ɖ or ʈ followed by a non-letter
 | ||
|     ə } $c     $s → a;  # Rule #5: ə before any other single consonant followed by a non-letter
 | ||
|     ə } ji     $s → a;  # Rule #6: ə before "ji" followed by a non-letter
 | ||
| k { ə } [rl] u    → a;  # Rule #7: ə between k and "ru" or "lu"
 | ||
| 
 | ||
| # Rule #8
 | ||
| # Note that the paper doesn't say explicitly that this rule should be
 | ||
| # anchored at the beginning of a word, but the remarks before the rules
 | ||
| # seem to imply this.
 | ||
| ::Null;  # every rule in this pass requires a non-letter ($s) followed by k on the left
 | ||
| $s k { a } l[aeo]ːj   → ə;  # Typo in paper: /j/ was /y/.
 | ||
| $s k { a } le[mh][ui] → ə;
 | ||
| $s k { alə } h[ui]    → əle;  # rewrites all three phones: "alə" becomes "əle" before hu or hi
 | ||
| $s k { a } lə         → ə;
 | ||
| 
 | ||
| # Diphthongs
 | ||
| ::Null;
 | ||
| www+ → ww;  # collapse a run of three or more w to two, e.g. in යෞව්වන
 | ||
| [i {iː} e {eː} æ {æː} o {oː} a {aː}] { wu → w;  # after any of the listed vowels, "wu" is reduced to w
 | ||
| 
 | ||
| əji → aj;
 | ||
| iji → iː;  # perhaps: ij
 | ||
| [u {uː} e {eː} æ {æː} o {oː} a {aː}] { ji → j;  # after any of the listed vowels, "ji" is reduced to j
 | ||
| 			]]></tRule>
 | ||
| 		</transform>
 | ||
| 	</transforms>
 | ||
| </supplementalData>
 |