From 6298d7d40c8a9891527bcb2084fef1a0e0a90cc1 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 27 Feb 2026 16:31:34 -0800 Subject: [PATCH] 11. unidata-18.0.0d11.txt A very minor update to the input file, changing the synthetic decomp for 1DF1F from <0064, 0064, F8F0> to <0064, 00F0>. That is the d-eth digraph, and the shorter decomposition is more correct, but was precluded originally by a bug in how the sifter tested for infinite recursion in decompositions. I have now corrected that test for infinite recursion, so the sifter now allows a pair of decompositions of the sort "x -> a, b" and "b -> a, c", as long as "a" is itself atomic and does not decompose further. Corresponding change to the sifter code is also posted: unisift.c. No generated output files are posted for this delta, because both allkeys.txt and decomps.txt produce the same output as for delta 10, except for the generation date. --- c/uca/sifter/unidata.txt | 6 +++--- c/uca/sifter/unisift.c | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/c/uca/sifter/unidata.txt b/c/uca/sifter/unidata.txt index 415607d60..ff08f2c3a 100644 --- a/c/uca/sifter/unidata.txt +++ b/c/uca/sifter/unidata.txt @@ -1,5 +1,5 @@ # unidata-18.0.0.txt -# Date: 2026-02-21, 00:00:00 GMT [KW] +# Date: 2026-02-26, 00:00:00 GMT [KW] # © 2026 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use and license, see https://www.unicode.org/copyright.html @@ -10,7 +10,7 @@ # Default Unicode Collation Element Table (DUCET) for # the Unicode Collation Algorithm. 
# -# Version 18.0.0 draft 10 (Unicode Version: 18.0.0) +# Version 18.0.0 draft 11 (Unicode Version: 18.0.0) # based on Unicode data file UnicodeData-18.0.0d8.txt # Ordering for Unicode 18.0 # @@ -14962,7 +14962,7 @@ AB66;LATIN SMALL LETTER DZ DIGRAPH WITH RETROFLEX HOOK;Ll; 0064 0290;;;;; 10788;MODIFIER LETTER SMALL DZ DIGRAPH WITH RETROFLEX HOOK;Lm; AB66;;;;; 0238;LATIN SMALL LETTER DB DIGRAPH;Ll; 0064 0062;;;;; 1DFDE;MODIFIER LETTER SMALL DB DIGRAPH;Lm; 0238;;;;; -1DF1F;LATIN SMALL LETTER D-ETH DIGRAPH;Ll; 0064 0064 F8F0;;;;; +1DF1F;LATIN SMALL LETTER D-ETH DIGRAPH;Ll; 0064 00F0;;;;; 1DF20;LATIN SMALL LETTER D-LEZH DIGRAPH;Ll; 0064 026E;;;;; 1DF21;LATIN SMALL LETTER D-LEZH DIGRAPH WITH RETROFLEX HOOK;Ll; 0064 1DF05;;;;; diff --git a/c/uca/sifter/unisift.c b/c/uca/sifter/unisift.c index 4cf00058e..9eeca017a 100644 --- a/c/uca/sifter/unisift.c +++ b/c/uca/sifter/unisift.c @@ -64,6 +64,7 @@ * 2026-Feb-19 Add 208F to isAlphabeticException. * Add 1B168 to kana mapping in unisift_GetKatakanaBase. * 2026-Feb-21 Add implicit weights for Jurchen and Seal. + * 2026-Feb-26 Tweak test for infinite recursion in doRecursiveDecomp. */ /* @@ -188,7 +189,7 @@ #define PATHNAMELEN (256) #define LONGESTARG (256) -static char versionString[] = "Sifter version 18.0.0d2, 2026-02-21\n"; +static char versionString[] = "Sifter version 18.0.0d3, 2026-02-26\n"; static char unidatafilename[] = "unidata-18.0.0.txt"; static char allkeysfilename[] = "allkeys-18.0.0.txt"; @@ -1026,12 +1027,39 @@ UInt32 buf3[60]; /* temporary hold for constructed decomp */ printf ( "Bad Value: %s\n", sp ); break; } + /* + * Check if the token is the same as the scalarValue + * passed in. If this token itself has a non-trivial + * decomposition, this can lead to infinite recursion. + * Test that case and bail out if found. + * + * If cc is Atomic (or Implicit), let this just fall through. 
+ * That allows for cases like: + * x -> a b + * b -> a c ==> x -> a a c + * + * This condition is not encountered for the normative + * decompositions, but might be encountered in synthetic + * decompositions added to unidata.txt for collation if + * not properly constructed. + */ if ( cc == scalarValue ) { - badValues++; - printf ( "Infinite recursion: %08X %s\n", scalarValue, - sp ); - return ( 0 ); + WALNUTPTR t1; + + t1 = getSiftDataPtr ( cc ); + if ( t1 == NULL ) + { + printf ( "Bad decomposition: %04X\n", cc ); + return ( 0 ); + } + if ( ( t1->decompType != Atomic ) && ( t1->decompType != Implicit ) ) + { + badValues++; + printf ( "Infinite recursion: %08X %s\n", scalarValue, + sp ); + return ( 0 ); + } } buf[numCharTokens] = cc; numCharTokens++;