<?xml version="1.0" encoding="UTF-8"?>
<!-- Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html -->
<!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
<!-- Test data file for string search -->
<!DOCTYPE stringsearch-tests [
<!ELEMENT stringsearch-tests (test-case+)>
<!ATTLIST stringsearch-tests debug IDREF #IMPLIED >
<!ELEMENT test-case (pattern, pre?, m?, post?)>
<!ATTLIST test-case
locale CDATA "en"
norm (ON | OFF) "OFF"
<!ELEMENT pattern (#PCDATA)>
<!-- debug="test11" (for copying into the above element) -->
<!-- Very simple match -->
<test-case id="test01" >
<!-- Very simple no-match -->
<test-case id="test02" >
<!-- Match after several near-misses. -->
<test-case id="test03" >
<pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post>
<test-case id="test04" strength="PRIMARY" >
<test-case id="test05" strength="PRIMARY" >
<test-case id="test05.5" strength="PRIMARY" >
<pre>a </pre>
<post>ball table</post>
<test-case id="test06" strength="PRIMARY" >
<test-case id="test07" strength="SECONDARY" >
<test-case id="test08" strength="PRIMARY" >
<!-- A good match that follows an initial candidate match which is rejected
because it does not end on a character boundary -->
<test-case id="test09" strength="PRIMARY">
<pre>fuß </pre><m>fus</m><post>sss</post>
<!-- Test cases from usrchdat.c BREAKITERATOREXACT -->
<test-case id="test10" strength="TERTIARY">
<m>fox</m><post>y fox</post>
<test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook">
<pre>This is a </pre><m>Tö</m><post>ne</post>
<test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook">
<pre>This is a </pre><post>Töne</post>
<test-case id="test12" strength="TERTIARY">
<pre>tésting that é doés not match </pre><m>e</m><post></post>
<test-case id="test13" strength="PRIMARY" locale="fr">
<test-case id="test14" strength="PRIMARY" locale="fr">
<!-- Test cases from usrchdat.c STRENGTH -->
<test-case id="test15" strength="PRIMARY" locale="en">
<pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post>
<test-case id="test16" strength="PRIMARY" locale="fr">
<pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post>
<test-case id="test17" strength="PRIMARY" locale="fr">
<pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post>
<test-case id="test18" strength="PRIMARY" locale="fr">
<pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post>
<test-case id="test19" strength="PRIMARY" locale="fr">
<pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post>
<test-case id="test20" strength="PRIMARY" locale="es">
<pre>A </pre><m>channel</m><post>, </post>
<test-case id="test21" strength="PRIMARY" locale="es">
<pre>A </pre><m>CHANNEL</m><post>, </post>
<test-case id="test22" strength="PRIMARY" locale="es">
<pre>A </pre><m>Channel</m><post>s, </post>
<test-case id="test23" strength="PRIMARY" locale="es">
<pre>A </pre><m>channel</m><post>... </post>
<test-case id="test24" strength="TERTIARY" locale="en">
<pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A"</post>
<!-- TODO: In the original test data, this test matched at IDENTICAL strength.
Doesn't seem right. The characters are different.
<test-case id="test24a" strength="IDENTICAL" locale="en">
<pre>At IDENTICAL, should this match? </pre><m>\u00c0</m><post></post>
<test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
<pre>At IDENTICAL, should this match? </pre>
<test-case id="test25" strength="SECONDARY" locale="en">
<pre>12</pre><m>ű</m><post> Ű</post>
<test-case id="test26" strength="SECONDARY" locale="en">
<!-- Test Cases from usrchdat.c, VARIABLE -->
<test-case id="test27" strength="TERTIARY" locale="en">
<pre>black-bird </pre><m>blackbird</m><post>...</post>
<test-case id="test28" strength="TERTIARY" locale="en">
<pre> on</pre>
<!-- TODO: this gives a U_ILLEGAL_ARGUMENT_ERROR when opening
the UStringSearch. How did the original test run? -->
<test-case id="test29" strength="PRIMARY" locale="en">
<pattern> </pattern>
<test-case id="test30" strength="SECONDARY" locale="en">
<pre> a bc ab c a bc ab c"</pre>
<test-case id="test31" strength="SECONDARY" locale="en">
<pre> ---------------</pre>
<!-- Normalization test cases from usrchdat.c -->
<test-case id="test32" strength="TERTIARY" norm="ON">
<test-case id="test32a" strength="TERTIARY" norm="OFF">
<!-- COMPOSITEBOUNDARIES from usrchdat.c
Boundaries are not identical to the original test data because
only full combining sequences are matched
<test-case id="test40" strength="TERTIARY">
<pre>À</pre> <!-- \u00C0 -->
<test-case id="test41" strength="TERTIARY">
<test-case id="test42" strength="TERTIARY">
<!-- SUPPLEMENTARYCANONICAL from usrchdat.c -->
<test-case id="test50" strength="TERTIARY">
<pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m>
<post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
<test-case id="test51" strength="TERTIARY">
<pre>and</pre><m>\\uD834\\uDDB9</m><post>this sentence</post>
<test-case id="test52" strength="TERTIARY">
<pattern> \\uD834\\uDDB9 </pattern>
<pre>and</pre><m> \\uD834\\uDDB9 </m><post>this sentence</post>
<test-case id="test53" strength="TERTIARY">
<pre>and</pre><m>-\\uD834\\uDDB9-</m><post>this sentence</post>
<test-case id="test54" strength="TERTIARY">
<pre>and</pre><m>,\\uD834\\uDDB9,</m><post>this sentence</post>
<test-case id="test55" strength="TERTIARY">
<pre>and</pre><m>?\\uD834\\uDDB9?</m><post>this sentence</post>
<!-- Long combining sequences -->
<!-- Backwards search fails because the pattern ends with ignorables
<test-case id="test60" strength="PRIMARY">
<test-case id="test61" strength="TERTIARY">
<test-case id="test62" strength="TERTIARY">
<!-- stand-alone combining marks don't match attached marks -->
<test-case id="test63" strength="TERTIARY">
<test-case id="test64" strength="TERTIARY">
<!-- stand-alone combining mark does match an un-attached combining mark -->
<test-case id="test65" strength="TERTIARY">
<test-case id="test66" strength="TERTIARY">
<!-- stand-alone combining marks at end of the target text -->
<test-case id="test67" strength="TERTIARY">
<!-- attached combining marks at end of the target text, no match -->
<test-case id="test68" strength="TERTIARY">
<!-- no match within expansions at the start -->
<test-case id="test70" strength="PRIMARY">
<test-case id="test71" strength="PRIMARY">
<test-case id="test72" strength="PRIMARY">
<!-- An unattached combining tilde will not match a tilde that is
part of a composed Ñ (\u00D1) -->
<test-case id="test73" strength="SECONDARY">
<pattern>\u0303</pattern> <!-- combining tilde -->
<test-case id="test74" strength="SECONDARY">
<pattern>\u0303</pattern> <!-- combining tilde -->
<test-case id="test75" strength="TERTIARY" locale="fr">
<test-case id="test76" strength="TERTIARY" locale="fr">
<test-case id="test77" strength="TERTIARY" locale="fr">
<!-- Test cases from ticket:5382 -->
<test-case id="test78" strength="SECONDARY" locale="hu_HU">
<test-case id="test79" strength="SECONDARY" locale="hu_HU">
<test-case id="test80" strength="SECONDARY" locale="hu_HU">
<!-- Test cases from ticket:5959 -->
<test-case id="test81" strength="SECONDARY">
<test-case id="test82" strength="SECONDARY">
<test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
<pattern>Universal Declaration of Human Rights</pattern>
<pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post>
<test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en">
<pattern>Universal Declaration of Human Rights</pattern>
<pre>Proclaims this </pre>
<post> as a common standard of achievement for all peoples and all nations</post>
<test-case id="test84" strength="TERTIARY" locale="en">
<test-case id="test84b" strength="IDENTICAL" locale="en">