Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions c/uca/sifter/unidata.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# unidata-18.0.0.txt
# Date: 2026-02-21, 00:00:00 GMT [KW]
# Date: 2026-02-26, 00:00:00 GMT [KW]
# © 2026 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use and license, see https://www.unicode.org/copyright.html
Expand All @@ -10,7 +10,7 @@
# Default Unicode Collation Element Table (DUCET) for
# the Unicode Collation Algorithm.
#
# Version 18.0.0 draft 10 (Unicode Version: 18.0.0)
# Version 18.0.0 draft 11 (Unicode Version: 18.0.0)
# based on Unicode data file UnicodeData-18.0.0d8.txt
# Ordering for Unicode 18.0
#
Expand Down Expand Up @@ -14962,7 +14962,7 @@ AB66;LATIN SMALL LETTER DZ DIGRAPH WITH RETROFLEX HOOK;Ll;<sort> 0064 0290;;;;;
10788;MODIFIER LETTER SMALL DZ DIGRAPH WITH RETROFLEX HOOK;Lm;<super> AB66;;;;;
0238;LATIN SMALL LETTER DB DIGRAPH;Ll;<sort> 0064 0062;;;;;
1DFDE;MODIFIER LETTER SMALL DB DIGRAPH;Lm;<super> 0238;;;;;
1DF1F;LATIN SMALL LETTER D-ETH DIGRAPH;Ll;<sort> 0064 0064 F8F0;;;;;
1DF1F;LATIN SMALL LETTER D-ETH DIGRAPH;Ll;<sort> 0064 00F0;;;;;
1DF20;LATIN SMALL LETTER D-LEZH DIGRAPH;Ll;<sort> 0064 026E;;;;;
1DF21;LATIN SMALL LETTER D-LEZH DIGRAPH WITH RETROFLEX HOOK;Ll;<sort> 0064 1DF05;;;;;

Expand Down
38 changes: 33 additions & 5 deletions c/uca/sifter/unisift.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
* 2026-Feb-19 Add 208F to isAlphabeticException.
* Add 1B168 to kana mapping in unisift_GetKatakanaBase.
* 2026-Feb-21 Add implicit weights for Jurchen and Seal.
* 2026-Feb-26 Tweak test for infinite recursion in doRecursiveDecomp.
*/

/*
Expand Down Expand Up @@ -188,7 +189,7 @@
#define PATHNAMELEN (256)
#define LONGESTARG (256)

static char versionString[] = "Sifter version 18.0.0d2, 2026-02-21\n";
static char versionString[] = "Sifter version 18.0.0d3, 2026-02-26\n";

static char unidatafilename[] = "unidata-18.0.0.txt";
static char allkeysfilename[] = "allkeys-18.0.0.txt";
Expand Down Expand Up @@ -1026,12 +1027,39 @@ UInt32 buf3[60]; /* temporary hold for constructed decomp */
printf ( "Bad Value: %s\n", sp );
break;
}
/*
* Check if the token is the same as the scalarValue
* passed in. If this token itself has a non-trivial
* decomposition, this can lead to infinite recursion.
* Test for that case and bail out if found.
*
* If the decomposition type of cc is Atomic (or Implicit), let this just fall through.
* That allows for cases like:
* x -> a b
* b -> a c ==> x -> a a c
*
* This condition is not encountered for the normative
* decompositions, but might be encountered in synthetic
* decompositions added to unidata.txt for collation if
* not properly constructed.
*/
if ( cc == scalarValue )
{
badValues++;
printf ( "Infinite recursion: %08X %s\n", scalarValue,
sp );
return ( 0 );
WALNUTPTR t1;

t1 = getSiftDataPtr ( cc );
if ( t1 == NULL )
{
printf ( "Bad decomposition: %04X\n", cc );
return ( 0 );
}
if ( ( t1->decompType != Atomic ) && ( t1->decompType != Implicit ) )
{
badValues++;
printf ( "Infinite recursion: %08X %s\n", scalarValue,
sp );
return ( 0 );
}
}
buf[numCharTokens] = cc;
numCharTokens++;
Expand Down
Loading