Skip to content

[5.7] [stdlib] Add Unicode script data to scalar properties #59201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
432 changes: 432 additions & 0 deletions stdlib/private/StdlibUnicodeUnittest/UnicodeScalarProperties.swift

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion stdlib/public/SwiftShims/UnicodeData.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ __swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
// Grapheme Breaking
//===----------------------------------------------------------------------===//


SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);

Expand Down Expand Up @@ -101,6 +100,14 @@ __swift_uint16_t _swift_stdlib_getAge(__swift_uint32_t scalar);
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getGeneralCategory(__swift_uint32_t scalar);

// Returns the script value for `scalar`, looked up in the script range table.
// If no range in the table contains `scalar`, the maximum `__swift_uint8_t`
// value (255) is returned to signal failure.
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getScript(__swift_uint32_t scalar);

// Looks up the script extensions for `scalar`.
//
// On success, returns a pointer to the first script value and stores the
// number of values in `*count`. Returns a null pointer when the scalar has no
// script extension data; callers should initialize `*count` themselves and
// must not rely on it carrying a meaningful value in that case.
SWIFT_RUNTIME_STDLIB_INTERNAL
const __swift_uint8_t *_swift_stdlib_getScriptExtensions(
__swift_uint32_t scalar,
__swift_uint8_t *count);

#ifdef __cplusplus
} // extern "C"
#endif
Expand Down
42 changes: 42 additions & 0 deletions stdlib/public/core/UnicodeSPI.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
//
//===----------------------------------------------------------------------===//

import SwiftShims

//===----------------------------------------------------------------------===//
// Unicode.NFD
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -123,3 +125,43 @@ extension Substring {
Unicode._NFC(base: unicodeScalars)
}
}

//===----------------------------------------------------------------------===//
// Unicode.Script
//===----------------------------------------------------------------------===//

extension Unicode.Scalar.Properties {
  /// The raw script value that `_swift_stdlib_getScript` reports for this
  /// scalar.
  ///
  /// The runtime uses `UInt8.max` as its "not found" marker; an internal
  /// invariant asserts that this marker never reaches the caller.
  @_spi(_Unicode)
  @available(SwiftStdlib 5.7, *)
  public var _script: UInt8 {
    let script = _swift_stdlib_getScript(_scalar.value)
    _internalInvariant(script != .max, "Unknown script rawValue")
    return script
  }

  /// The raw script values from the runtime's script extension data for this
  /// scalar.
  ///
  /// When the runtime returns no extension data (a null pointer), the result
  /// is a single-element array containing `_script`.
  @_spi(_Unicode)
  @available(SwiftStdlib 5.7, *)
  public var _scriptExtensions: [UInt8] {
    var extensionCount: UInt8 = 0

    guard
      let base = _swift_stdlib_getScriptExtensions(
        _scalar.value,
        &extensionCount
      )
    else {
      // No dedicated extension entry: fall back to the scalar's own script.
      return [_script]
    }

    // View the runtime's storage as a bounded buffer so it can be iterated
    // directly instead of being indexed by hand.
    let extensions = UnsafeBufferPointer(
      start: base,
      count: Int(extensionCount)
    )

    var scripts: [UInt8] = []
    scripts.reserveCapacity(extensions.count)

    for script in extensions {
      _internalInvariant(script != .max, "Unknown script rawValue")
      scripts.append(script)
    }

    return scripts
  }
}
335 changes: 335 additions & 0 deletions stdlib/public/stubs/Unicode/Common/ScriptData.h

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion stdlib/public/stubs/Unicode/UnicodeData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ __swift_intptr_t _swift_stdlib_getScalarBitArrayIdx(__swift_uint32_t scalar,

// If our chunk index is larger than the quick look indices, then it means
// our scalar appears in chunks that are all 0 and trailing.
if ((__swift_uint64_t) idx > quickLookSize) {
if ((__swift_uint64_t) idx > quickLookSize - 1) {
return std::numeric_limits<__swift_intptr_t>::max();
}

Expand Down
83 changes: 83 additions & 0 deletions stdlib/public/stubs/Unicode/UnicodeScalarProps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "Common/ScalarPropsData.h"
#endif

#include "Common/ScriptData.h"

#else
#include "swift/Runtime/Debug.h"
#endif
Expand Down Expand Up @@ -427,3 +429,84 @@ __swift_uint8_t _swift_stdlib_getGeneralCategory(__swift_uint32_t scalar) {
return std::numeric_limits<__swift_uint8_t>::max();
#endif
}

// Binary-searches the script range table for the range containing `scalar`
// and returns that range's script value, or 255 if no range contains it.
//
// Each table entry packs the first scalar of a range together with the
// script value stored in the bits above it. A range ends one scalar before
// the next entry's first scalar; the final range ends at 0x10FFFF.
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_uint8_t _swift_stdlib_getScript(__swift_uint32_t scalar) {
#if !SWIFT_STDLIB_ENABLE_UNICODE_DATA
  swift::swift_abortDisabledUnicodeSupport();
#else
  const auto entryCount = SCRIPTS_COUNT;
  auto low = 0;
  auto high = entryCount - 1;

  while (high >= low) {
    const auto mid = low + (high - low) / 2;
    const auto packed = _swift_stdlib_scripts[mid];

    // Drop the script bits to recover the first scalar of this range.
    const auto rangeStart = (packed << 11) >> 11;

    // The last entry has no successor, so its range runs to the end of the
    // Unicode code space. Every other range stops just before the next
    // entry's first scalar.
    __swift_uint32_t rangeEnd = 0x10FFFF;
    if (mid != entryCount - 1) {
      const auto nextPacked = _swift_stdlib_scripts[mid + 1];
      rangeEnd = ((nextPacked << 11) >> 11) - 1;
    }

    if (scalar > rangeEnd) {
      // Target lies in a later range.
      low = mid + 1;
    } else if (scalar < rangeStart) {
      // Target lies in an earlier range.
      high = mid - 1;
    } else {
      // rangeStart <= scalar <= rangeEnd: drop the scalar bits to get the
      // script value.
      return static_cast<__swift_uint8_t>(packed >> 21);
    }
  }

  // The table is expected to cover every scalar from 0x0 to 0x10FFFF, so the
  // search should always succeed. If it somehow does not, report failure
  // with 255.
  return std::numeric_limits<__swift_uint8_t>::max();
#endif
}

// Looks up the script extensions recorded for `scalar`.
//
// On success, returns a pointer to the first script value inside
// `_swift_stdlib_script_extensions_data` and writes the number of values to
// `*count`. When the scalar has no script extension data, returns a null
// pointer and writes 0 to `*count`, so the out-parameter is defined on every
// returning path.
//
// `count` must be a valid, non-null pointer.
SWIFT_RUNTIME_STDLIB_INTERNAL
const __swift_uint8_t *_swift_stdlib_getScriptExtensions(__swift_uint32_t scalar,
                                                         __swift_uint8_t *count) {
#if !SWIFT_STDLIB_ENABLE_UNICODE_DATA
  swift::swift_abortDisabledUnicodeSupport();
#else
  auto dataIdx = _swift_stdlib_getScalarBitArrayIdx(scalar,
                                        _swift_stdlib_script_extensions,
                                        _swift_stdlib_script_extensions_ranks);

  // If we don't have an index into the data indices, then this scalar has no
  // script extensions. Still write the count so callers never read an
  // uninitialized value.
  if (dataIdx == std::numeric_limits<__swift_intptr_t>::max()) {
    *count = 0;
    return nullptr;
  }

  // Each data index packs the element count above bit 11 and the offset into
  // the extensions data array in the low 11 bits.
  auto scalarDataIdx = _swift_stdlib_script_extensions_data_indices[dataIdx];
  *count = scalarDataIdx >> 11;

  return _swift_stdlib_script_extensions_data + (scalarDataIdx & 0x7FF);
#endif
}
Loading