Skip to content

Commit f9c6336

Browse files
committed
Support Unicode properties Identifier_(Status|Type)
The Unicode regular expression specification, UTS #18, now asks that these non-UCD properties be supported. Their data files give the version in a slightly different header syntax than UCD files do. In this commit, I modify the headers of these files to fit the syntax mktables expects, but the parsing of versions in mktables will probably have to be generalized at some point.
1 parent ac92cca commit f9c6336

File tree

8 files changed

+24717
-8273
lines changed

8 files changed

+24717
-8273
lines changed

charclass_invlists.h

Lines changed: 16373 additions & 15 deletions
Large diffs are not rendered by default.

lib/unicore/IdStatus.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# IdentifierStatus.txt
1+
# IdentifierStatus-13.0.0.txt
22
# Date: 2019-10-22, 13:05:26 GMT
33
# © 2019 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.

lib/unicore/IdType.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# IdentifierType.txt
1+
# IdentifierType-13.0.0.txt
22
# Date: 2019-11-05, 08:05:20 GMT
33
# © 2019 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.

lib/unicore/mktables

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1468,6 +1468,7 @@ my $DI; # Default_Ignorable_Code_Point property
14681468
my $NChar; # Noncharacter_Code_Point property
14691469
my $script;
14701470
my $scx; # Script_Extensions property
1471+
my $idt; # Identifier_Type property
14711472

14721473
# Are there conflicting names because of beginning with 'In_', or 'Is_'
14731474
my $has_In_conflicts = 0;
@@ -13546,6 +13547,36 @@ sub filter_emojidata_line {
1354613547
return;
1354713548
}
1354813549

13550+
sub setup_IdStatus {
13551+
my $ids = Property->new('Identifier_Status',
13552+
Match_SubDir => 'IdStatus',
13553+
Default_Map => 'Restricted',
13554+
);
13555+
$ids->add_match_table('Allowed');
13556+
}
13557+
13558+
sub setup_IdType {
13559+
$idt = Property->new('Identifier_Type',
13560+
Match_SubDir => 'IdType',
13561+
Default_Map => 'Not_Character',
13562+
Format => $STRING_WHITE_SPACE_LIST,
13563+
);
13564+
}
13565+
13566+
sub filter_IdType_line {
13567+
13568+
# Some code points have more than one type, separated by spaces on the
13569+
# input. For now, we just add everything as a property value. Later when
13570+
# we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve
13571+
# things
13572+
13573+
my @fields = split /\s*;\s*/;
13574+
my $types = $fields[1];
13575+
$idt->add_match_table($types) unless defined $idt->table($types);
13576+
13577+
return;
13578+
}
13579+
1354913580
sub generate_hst {
1355013581

1355113582
# Populates the Hangul Syllable Type property from first principles
@@ -20133,6 +20164,15 @@ my @input_file_objects = (
2013320164
Input_file->new("$EMOJI/emoji.txt", v13.0.0,
2013420165
Has_Missings_Defaults => $NOT_IGNORED,
2013520166
),
20167+
Input_file->new('IdStatus.txt', v13.0.0,
20168+
Pre_Handler => \&setup_IdStatus,
20169+
Property => 'Identifier_Status',
20170+
),
20171+
Input_file->new('IdType.txt', v13.0.0,
20172+
Pre_Handler => \&setup_IdType,
20173+
Each_Line_Handler => \&filter_IdType_line,
20174+
Property => 'Identifier_Type',
20175+
),
2013620176
);
2013720177

2013820178
# End of all the preliminaries.

0 commit comments

Comments
 (0)