#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# created on: 2013jun05
# created by: Markus W. Scherer

"""Converts CLDR collation files from XML syntax to ICU syntax.

Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
Preserves indentation (except where it joins lines) and text vs. NCR etc.
Does not handle arbitrary LDML XML collation syntax."""

# Invoke with two arguments:
# - the source folder path
# - the destination folder path
# For example:
# ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation
| 
 | |
| import codecs
 | |
| import glob
 | |
| import os.path
 | |
| import sys
 | |
| 
 | |
def GetIndent(s):
  """Returns the leading run of spaces and tabs of s.

  Only U+0020 and U+0009 count as indentation; a line terminator or any
  other character ends the run. If s is empty or consists entirely of
  spaces and tabs, all of s is returned.
  """
  # Everything that lstrip() removes from the front is the indentation.
  return s[:len(s) - len(s.lstrip(" \t"))]
| 
 | |
| 
 | |
| # substring replacements
 | |
# Ordered (XML substring, ICU substring) pairs.
# ConvertFile() applies them to each rules line one after another, in this
# order, so more specific patterns must precede the generic element
# replacements further down.
# The patterns are matched against the raw XML text, so characters that XML
# requires to be escaped are spelled as entities (&quot; &amp; &apos; &lt;
# &gt;) in the patterns.
replacements = (
  # White space and syntax characters must be quoted.
  # Using '\\u0020' rather than just ' ' for clarity.
  ("<reset> </reset>", "&'\\u0020'"),  # can't just replace all "> <"
  (">!<", ">'!'<"),
  ('>"<', ">'\\\"'<"),
  (">&quot;<", ">'\\\"'<"),
  (">#<", ">'\\u0023'<"),
  (">$<", ">'$'<"),
  (">%<", ">'%'<"),
  (">&<", ">'&'<"),
  (">&amp;<", ">'&'<"),
  (">'<", ">''<"),
  (">&apos;<", ">''<"),
  (">(<", ">'('<"),
  (">)<", ">')'<"),
  (">*<", ">'*'<"),
  (">+<", ">'+'<"),
  (">,<", ">','<"),
  (">-<", ">'-'<"),
  (">.<", ">'.'<"),
  (">/<", ">'/'<"),
  (">:<", ">':'<"),
  (">;<", ">';'<"),
  (">&lt;<", ">'<'<"),
  (">=<", ">'='<"),
  (">&gt;<", ">'>'<"),
  (">?<", ">'?'<"),
  (">@<", ">'@'<"),
  (">[<", ">'['<"),
  (">\\<", ">'\\\\'<"),
  (">]<", ">']'<"),
  (">^<", ">'^'<"),
  (">_<", ">'_'<"),
  (">`<", ">'`'<"),
  (">{<", ">'{'<"),
  (">|<", ">'|'<"),
  (">}<", ">'}'<"),
  (">~<", ">'~'<"),
  # ha.xml has the following
  ("'y", "''y"),
  ("'Y", "''Y"),
  # kl.xml has the following
  ("K'", "K''"),
  # not Pattern_White_Space, just obscure
  (u"\u00A0", u"\\u00A0"),
  (u"\u200C", u"\\u200C"),
  (u"\u200D", u"\\u200D"),
  (u"\u3000", u"\\u3000"),
  # obscure, and some tools do not handle noncharacters well
  (u"\uFDD0", u"'\\uFDD0'"),
  # The old ICU collation rule parser seems to need more escaping than it should.
  (u"≠", u"'≠'"),
  # fi.xml resets contain a space (here followed by a combining mark)
  (u" ̵</reset>", u"'\\u0020'̵"),
  # fa.xml <sc> with non-NFD_Inert chars
  (u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
  # ml.xml strings contain spaces
  (u" </s>", u"'\\u0020'"),
  (u" </reset>", u"'\\u0020'"),
  # vi.xml <sc> with non-NFD_Inert chars
  (u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
  # en_US_POSIX needs a lot of quoting.
  ("<pc> !&quot;#$%&amp;'()*+,-./</pc>", "<*'\\u0020'-'/'"),
  ("<pc>0123456789:;&lt;=&gt;?@</pc>", "<*0-'@'"),
  ("<pc>[\\]^_`</pc>", "<*'['-'`'"),
  # The backslash is doubled so that the output contains the six-character
  # escape text (not a raw U+007F) regardless of the Python version.
  ("<pc>{|}~</pc>", "<*'{'-'\\u007F'"),
  # CJK parenthesized resets
  ("<reset>(", "&'('"),
  (")</reset>", "')'"),
  # Convert XML elements into ICU syntax.
  ("><!--", "> #"),  # add a space before an inline comment
  ("<!--", "#"),
  (" -->", ""),
  ("-->", ""),
  ("<reset>", "&"),
  ('<reset before="primary">', "&[before 1]"),
  ('<reset before="secondary">', "&[before 2]"),
  ('<reset before="tertiary">', "&[before 3]"),
  ("</reset>", ""),
  ("<p>", "<"),
  ("</p>", ""),
  ("<s>", "<<"),
  ("</s>", ""),
  ("<t>", "<<<"),
  ("</t>", ""),
  ("<i>", "="),
  ("</i>", ""),
  ("<pc>", "<*"),
  ("</pc>", ""),
  ("<sc>", "<<*"),
  ("</sc>", ""),
  ("<tc>", "<<<*"),
  ("</tc>", ""),
  ("<ic>", "=*"),
  ("</ic>", ""),
  ("<x>", ""),
  ("</x>", ""),
  ("<extend>", "/"),
  ("</extend>", ""),
  ("</context>", "|"),
  ("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
  ("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
  ("<first_secondary_ignorable/>", "[first secondary ignorable]"),
  ("<last_secondary_ignorable/>", "[last secondary ignorable]"),
  ("<first_primary_ignorable/>", "[first primary ignorable]"),
  ("<last_primary_ignorable/>", "[last primary ignorable]"),
  ("<first_variable/>", "[first variable]"),
  ("<last_variable/>", "[last variable]"),
  ("<first_non_ignorable/>", "[first regular]"),
  ("<last_non_ignorable/>", "[last regular]"),
  ("<last_non_ignorable />", "[last regular]"),
  ("<first_trailing/>", "[first trailing]"),
  ("<last_trailing/>", "[last trailing]"),
)
| 
 | |
| 
 | |
# Import-only <rules> lines that are replaced with the bare import element.
# Maps the complete (stripped) source line to the element to write.
_IMPORT_ONLY_RULES = dict(
  ("<rules>" + element + "</rules>", element) for element in (
    '<import source="sr"/>',
    '<import source="hr" type="search"/>',
    '<import source="hr"/>',
    '<import source="ps"/>',
  )
)


def _ConvertNCRs(line):
  """Returns line with each hexadecimal NCR (&#xh...;) turned into \\uhhhh.

  Raises ValueError if a "&#x" has no terminating ";" on the same line
  (scanning for it would otherwise never terminate), and AssertionError
  for references with more than four hex digits (supplementary code points
  are not handled).
  """
  start = line.find("&#x")
  while start >= 0:
    limit = line.find(";", start + 3)
    if limit < 0:
      raise ValueError(
          "unterminated numeric character reference in: " + line.strip())
    cp = line[start + 3:limit].rjust(4, "0")
    assert len(cp) == 4  # not handling supplementary code points
    line = line[:start] + "\\u" + cp + line[limit + 1:]
    # Nothing before start contains "&#x", and the inserted escape begins
    # with a backslash, so continuing from start finds the same matches as
    # searching from the beginning.
    start = line.find("&#x", start)
  return line


def ConvertFile(src, dest):
  """Copies src to dest, rewriting each XML <rules> section in ICU syntax.

  Lines outside of <rules>...</rules> are copied unchanged. A line that is
  one of the known import-only <rules> lines is replaced with its bare
  <import> element. Any other <rules> section is written as
  <cr><![CDATA[ ... ]]></cr>, with its lines converted via the replacements
  table, numeric character references turned into backslash-u escapes,
  XML comments turned into # comments, and consecutive rule lines joined
  until the joined text exceeds 80 characters or a reset/comment starts.

  Args:
    src: iterable of text lines including their line terminators,
        for example an open file.
    dest: object with a write(text) method.

  Raises:
    AssertionError: for input that this script does not support, such as
        a <rules> start tag that shares its line with other text, text
        following a comment end on the same line, or a supplementary
        code point NCR.
    ValueError: for a "&#x" reference that is not terminated by ";".
  """
  in_rules = False
  partial = ""
  in_ml_comment = False
  comment_indent = ""  # indentation of the first line of a multi-line comment
  for line in src:
    if "<rules>" in line:
      indent = GetIndent(line)
      stripped = line.strip()
      # Replace import-only rules with import elements.
      import_element = _IMPORT_ONLY_RULES.get(stripped)
      if import_element is not None:
        dest.write(indent + import_element + "\n")
      else:
        # Replace the XML <rules> section with ICU syntax rules in <cr>.
        assert stripped == "<rules>"
        dest.write(indent + "<cr><![CDATA[\n")
        in_rules = True
    elif "</rules>" in line:
      # Flush, and go back to just copying lines until the next <rules>.
      if partial:
        dest.write(partial + "\n")
        partial = ""
      in_ml_comment = False
      dest.write(GetIndent(line) + "]]></cr>\n")
      in_rules = False
    else:
      if in_rules:
        # Find out whether we want to concatenate the current line
        # with the previous and/or next one.
        # These flags are derived from the XML text, before it is converted.
        finish_partial = False  # Finish collected, partial input.
        start_ml_comment = False  # Start of a multi-line comment.
        stop_comment = False  # End of a comment, must terminate the line.
        if ("<reset" in line) or line.lstrip().startswith("<!--"):
          finish_partial = True
        if partial and len(partial.strip()) > 80:
          finish_partial = True
        if "<!--" in line and "-->" not in line:
          start_ml_comment = True
        if "-->" in line:
          assert line.rstrip().endswith("-->")
          stop_comment = True

        # Convert XML syntax to ICU syntax.
        if "<context>" in line:
          # Swap context & relation:
          #   <x><context>カ</context><i>ー</i></x>
          # turns into
          #   =カ|ー
          if "<i>" in line:
            line = line.replace("<i>", "").replace("<context>", "<i>")
          elif "<t>" in line:
            line = line.replace("<t>", "").replace("<context>", "<t>")

        for (xml, icu) in replacements:
          line = line.replace(xml, icu)

        # Convert Numeric Character References to \\uhhhh.
        line = _ConvertNCRs(line)

        # Start/continue/finish concatenation, and output.
        if partial and finish_partial:
          # Write collected input.
          dest.write(partial + "\n")
          partial = ""

        if start_ml_comment:
          # Start a multi-line comment.
          assert not partial
          comment_indent = GetIndent(line)  # can be the empty string
          in_ml_comment = True
        elif in_ml_comment:
          # Continue a multi-line comment.
          assert not partial
          if line.startswith(comment_indent):
            rest = line[len(comment_indent):]
            # rest[:1] rather than rest[0]: rest is empty when the line is
            # exactly the indentation without a line terminator.
            if rest[:1] in (" ", "\t"):
              # Preserve further indentation.
              line = comment_indent + "#" + rest
            else:
              # Add a space after the #.
              line = comment_indent + "# " + rest
          else:
            # Indent at least as much as the first line.
            line = line.lstrip()
            if line:
              line = comment_indent + "# " + line
            else:
              line = comment_indent + "#\n"
        elif stop_comment:
          # Just output the line, do not start collecting input.
          # ICU-syntax comments end with the end of the line,
          # do not append rules to them.
          if partial:
            line = partial + line.lstrip() + "\n"
            partial = ""
        elif not partial:
          # Start collecting input.
          partial = line.rstrip()
        else:
          # Continue collecting input.
          partial += line.strip()

        if stop_comment:
          in_ml_comment = False
      # While input is being collected, the line is written later as part
      # of the collected text; otherwise it is written right away.
      if not partial: dest.write(line)
| 
 | |
| 
 | |
def main():
  """Converts every *.xml file of the source folder into the destination folder.

  The source and destination folder paths are taken from the first two
  command-line arguments. Each source file is read as UTF-8, passed through
  ConvertFile(), and written as UTF-8 to a file of the same name in the
  destination folder.

  Exits with a usage message if fewer than two arguments are given.
  """
  if len(sys.argv) < 3:
    # Without this check, the tuple unpacking below fails with an
    # unhelpful "values to unpack" error.
    sys.exit("Usage: %s <source folder path> <destination folder path>" %
             sys.argv[0])
  (src_root, dest_root) = sys.argv[1:3]
  src_pattern = os.path.join(src_root, "*.xml")
  for src_path in glob.iglob(src_pattern):
    basename = os.path.basename(src_path)
    dest_path = os.path.join(dest_root, basename)
    with codecs.open(src_path, "r", "UTF-8") as src:
      with codecs.open(dest_path, "w", "UTF-8") as dest:
        ConvertFile(src, dest)


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
  main()