Skip to content

Commit e16b83f

Browse files
authored
[5.7] [stdlib] Add Unicode script data to scalar properties (#59201)
* Merge pull request #42227 from Azoy/add-script-data [stdlib] Add Unicode script data to scalar properties * Add Script and Script Extension tests (#59194)
1 parent 6d62e71 commit e16b83f

File tree

9 files changed

+4568
-2
lines changed

9 files changed

+4568
-2
lines changed

stdlib/private/StdlibUnicodeUnittest/UnicodeScalarProperties.swift

Lines changed: 432 additions & 0 deletions
Large diffs are not rendered by default.

stdlib/public/SwiftShims/UnicodeData.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ __swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
5858
// Grapheme Breaking
5959
//===----------------------------------------------------------------------===//
6060

61-
6261
SWIFT_RUNTIME_STDLIB_INTERNAL
6362
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);
6463

@@ -101,6 +100,14 @@ __swift_uint16_t _swift_stdlib_getAge(__swift_uint32_t scalar);
101100
SWIFT_RUNTIME_STDLIB_INTERNAL
102101
__swift_uint8_t _swift_stdlib_getGeneralCategory(__swift_uint32_t scalar);
103102

103+
SWIFT_RUNTIME_STDLIB_INTERNAL
104+
__swift_uint8_t _swift_stdlib_getScript(__swift_uint32_t scalar);
105+
106+
SWIFT_RUNTIME_STDLIB_INTERNAL
107+
const __swift_uint8_t *_swift_stdlib_getScriptExtensions(
108+
__swift_uint32_t scalar,
109+
__swift_uint8_t *count);
110+
104111
#ifdef __cplusplus
105112
} // extern "C"
106113
#endif

stdlib/public/core/UnicodeSPI.swift

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
import SwiftShims
14+
1315
//===----------------------------------------------------------------------===//
1416
// Unicode.NFD
1517
//===----------------------------------------------------------------------===//
@@ -123,3 +125,43 @@ extension Substring {
123125
Unicode._NFC(base: unicodeScalars)
124126
}
125127
}
128+
129+
//===----------------------------------------------------------------------===//
130+
// Unicode.Script
131+
//===----------------------------------------------------------------------===//
132+
133+
extension Unicode.Scalar.Properties {
  /// The raw value of this scalar's script, as returned by the
  /// `_swift_stdlib_getScript` shim.
  ///
  /// A raw value of `UInt8.max` is treated as "unknown script" and trips an
  /// internal invariant check.
  @_spi(_Unicode)
  @available(SwiftStdlib 5.7, *)
  public var _script: UInt8 {
    let script = _swift_stdlib_getScript(_scalar.value)
    _internalInvariant(script != .max, "Unknown script rawValue")
    return script
  }

  /// The raw values of this scalar's script extensions.
  ///
  /// When the shim returns no extension data for the scalar, the result is a
  /// single-element array containing `_script`.
  @_spi(_Unicode)
  @available(SwiftStdlib 5.7, *)
  public var _scriptExtensions: [UInt8] {
    // The shim writes the number of extension entries through this
    // out-parameter and returns a pointer to the first entry.
    var extensionCount: UInt8 = 0

    guard let base = _swift_stdlib_getScriptExtensions(
      _scalar.value,
      &extensionCount
    ) else {
      // No explicit extensions: fall back to the scalar's own script.
      return [_script]
    }

    let total = Int(extensionCount)

    var scripts: [UInt8] = []
    scripts.reserveCapacity(total)

    for offset in 0 ..< total {
      let script = base[offset]
      _internalInvariant(script != .max, "Unknown script rawValue")
      scripts.append(script)
    }

    return scripts
  }
}

stdlib/public/stubs/Unicode/Common/ScriptData.h

Lines changed: 335 additions & 0 deletions
Large diffs are not rendered by default.

stdlib/public/stubs/Unicode/UnicodeData.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ __swift_intptr_t _swift_stdlib_getScalarBitArrayIdx(__swift_uint32_t scalar,
115115

116116
// If our chunk index is larger than the quick look indices, then it means
117117
// our scalar appears in chunks that are all 0 and trailing.
118-
if ((__swift_uint64_t) idx > quickLookSize) {
118+
if ((__swift_uint64_t) idx > quickLookSize - 1) {
119119
return std::numeric_limits<__swift_intptr_t>::max();
120120
}
121121

stdlib/public/stubs/Unicode/UnicodeScalarProps.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include "Common/ScalarPropsData.h"
1919
#endif
2020

21+
#include "Common/ScriptData.h"
22+
2123
#else
2224
#include "swift/Runtime/Debug.h"
2325
#endif
@@ -427,3 +429,84 @@ __swift_uint8_t _swift_stdlib_getGeneralCategory(__swift_uint32_t scalar) {
427429
return std::numeric_limits<__swift_uint8_t>::max();
428430
#endif
429431
}
432+
433+
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getScript(__swift_uint32_t scalar) {
#if !SWIFT_STDLIB_ENABLE_UNICODE_DATA
  swift::swift_abortDisabledUnicodeSupport();
#else
  // Binary search over `_swift_stdlib_scripts`. Each entry packs the first
  // scalar of a range in its low 21 bits and the script enum value in the
  // bits above that. A range runs up to (but not including) the first scalar
  // of the following entry; the final range runs through 0x10FFFF.
  auto entryCount = SCRIPTS_COUNT;
  auto lastIndex = entryCount - 1;

  auto low = 0;
  auto high = lastIndex;

  while (low <= high) {
    auto mid = low + (high - low) / 2;
    auto packed = _swift_stdlib_scripts[mid];

    // Drop the script bits to recover the first scalar of this range.
    auto rangeStart = (packed << 11) >> 11;

    // Last scalar of this range: one before the next entry's start, or the
    // top of the Unicode code space for the final entry.
    __swift_uint32_t rangeEnd = 0x10FFFF;

    if (mid != lastIndex) {
      auto following = _swift_stdlib_scripts[mid + 1];
      auto followingStart = (following << 11) >> 11;

      rangeEnd = followingStart - 1;
    }

    // Scalar lies above this range: continue in the upper half.
    if (scalar > rangeEnd) {
      low = mid + 1;
      continue;
    }

    // Scalar lies below this range: continue in the lower half.
    if (scalar < rangeStart) {
      high = mid - 1;
      continue;
    }

    // Scalar is inside [rangeStart, rangeEnd]; drop the scalar bits to get
    // the script enum value.
    return packed >> 21;
  }

  // Falling out of the loop means no range contained the scalar. The table is
  // expected to cover 0x0 through 0x10FFFF, so this should not happen; return
  // 255 to signal the failure.
  return std::numeric_limits<__swift_uint8_t>::max();
#endif
}
490+
491+
SWIFT_RUNTIME_STDLIB_INTERNAL
const __swift_uint8_t *_swift_stdlib_getScriptExtensions(__swift_uint32_t scalar,
                                                         __swift_uint8_t *count) {
#if !SWIFT_STDLIB_ENABLE_UNICODE_DATA
  swift::swift_abortDisabledUnicodeSupport();
#else
  // Sentinel the bit-array lookup uses for "scalar not present".
  const auto notFound = std::numeric_limits<__swift_intptr_t>::max();

  auto bitArrayIdx = _swift_stdlib_getScalarBitArrayIdx(
      scalar,
      _swift_stdlib_script_extensions,
      _swift_stdlib_script_extensions_ranks);

  // No entry in the bit array means this scalar has no script extensions.
  // `*count` is left untouched on this path.
  if (bitArrayIdx == notFound) {
    return nullptr;
  }

  // Each data index packs the number of extensions in the bits above bit 10
  // and the offset into the extensions data in the low 11 bits.
  auto packed = _swift_stdlib_script_extensions_data_indices[bitArrayIdx];
  *count = packed >> 11;

  return _swift_stdlib_script_extensions_data + (packed & 0x7FF);
#endif
}

0 commit comments

Comments
 (0)