chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,7 @@
{
"git": {
"sha1": "38a49da495248dd1ded84cf306e4ca42e64d5bb3",
"dirty": true
},
"path_in_vcs": "components/locale_core"
}

165
vendor/icu_locale_core/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,165 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "databake"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
dependencies = [
"databake-derive",
"proc-macro2",
"quote",
]
[[package]]
name = "databake-derive"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "icu_locale_core"
version = "2.1.1"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "syn"
version = "2.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tinystr"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
dependencies = [
"displaydoc",
"serde_core",
"zerovec",
]
[[package]]
name = "unicode-ident"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
[[package]]
name = "writeable"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"serde",
"zerofrom",
]

128
vendor/icu_locale_core/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,128 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.83"
name = "icu_locale_core"
version = "2.1.1"
authors = ["The ICU4X Project Developers"]
build = false
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "API for managing Unicode Language and Locale Identifiers"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[package.metadata.docs.rs]
all-features = true
[features]
alloc = [
"litemap/alloc",
"tinystr/alloc",
"writeable/alloc",
"serde?/alloc",
]
databake = [
"dep:databake",
"alloc",
]
serde = [
"dep:serde",
"tinystr/serde",
]
zerovec = [
"dep:zerovec",
"tinystr/zerovec",
]
[lib]
name = "icu_locale_core"
path = "src/lib.rs"
bench = false
[[test]]
name = "langid"
path = "tests/langid.rs"
[[test]]
name = "locale"
path = "tests/locale.rs"
[[bench]]
name = "iai_langid"
path = "benches/iai_langid.rs"
harness = false
[[bench]]
name = "langid"
path = "benches/langid.rs"
harness = false
[[bench]]
name = "locale"
path = "benches/locale.rs"
harness = false
[[bench]]
name = "subtags"
path = "benches/subtags.rs"
harness = false
[dependencies.databake]
version = "0.2.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.displaydoc]
version = "0.2.3"
default-features = false
[dependencies.litemap]
version = "0.8.0"
default-features = false
[dependencies.serde]
version = "1.0.220"
optional = true
default-features = false
[dependencies.tinystr]
version = "0.8.0"
default-features = false
[dependencies.writeable]
version = "0.6.0"
default-features = false
[dependencies.zerovec]
version = "0.11.3"
optional = true
default-features = false

46
vendor/icu_locale_core/LICENSE vendored Normal file
View File

@@ -0,0 +1,46 @@
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.

55
vendor/icu_locale_core/README.md vendored Normal file
View File

@@ -0,0 +1,55 @@
# icu_locale_core [![crates.io](https://img.shields.io/crates/v/icu_locale_core)](https://crates.io/crates/icu_locale_core)
<!-- cargo-rdme start -->
Parsing, manipulating, and serializing Unicode Language and Locale Identifiers.
This module is published as its own crate ([`icu_locale_core`](https://docs.rs/icu_locale_core/latest/icu_locale_core/))
and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
The module provides algorithms for parsing a string into a well-formed language or locale identifier
as defined by [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]. Additionally
the module provides a [`preferences`] interface for operations on locale preferences and conversions
from and to locale unicode extensions.
[`Locale`] is the most common structure to use for storing information about a language,
script, region, variants and extensions. In almost all cases, this struct should be used as the
base unit for all locale management operations.
[`LanguageIdentifier`] is a strict subset of [`Locale`] which can be useful in a narrow range of
cases where [`Unicode Extensions`] are not relevant.
If in doubt, use [`Locale`].
## Examples
```rust
use icu::locale::Locale;
use icu::locale::{
locale,
subtags::{language, region},
};
let mut loc: Locale = locale!("en-US");
assert_eq!(loc.id.language, language!("en"));
assert_eq!(loc.id.script, None);
assert_eq!(loc.id.region, Some(region!("US")));
assert_eq!(loc.id.variants.len(), 0);
loc.id.region = Some(region!("GB"));
assert_eq!(loc, locale!("en-GB"));
```
For more details, see [`Locale`] and [`LanguageIdentifier`].
[`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]: https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers
[`ICU4X`]: ../icu/index.html
[`Unicode Extensions`]: extensions
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).

View File

@@ -0,0 +1,48 @@
{
"canonicalized": [
"en-US",
"en-GB",
"es-AR",
"it",
"zh-Hans-CN",
"de-AT",
"pl",
"fr-FR",
"de-AT",
"sr-Cyrl-SR",
"nb-NO",
"fr-FR",
"mk",
"uk",
"en-US",
"en-GB",
"es-AR",
"th",
"de",
"zh-Cyrl-HN",
"en-Latn-US"
],
"casing": [
"En_uS",
"EN-GB",
"ES-aR",
"iT",
"zH_HaNs_cN",
"dE-aT",
"Pl",
"FR-FR",
"de_AT",
"sR-CyrL_sr",
"NB-NO",
"fr_fr",
"Mk",
"uK",
"en-us",
"en_gb",
"ES-AR",
"tH",
"DE",
"ZH_cyrl_hN",
"eN-lAtN-uS"
]
}

View File

@@ -0,0 +1,26 @@
{
"canonicalized": [
"en-US-u-hc-h12",
"en-GB-u-ca-gregory-hc-h12",
"es-AR-x-private",
"th-u-ca-buddhist",
"de-u-co-phonebk-ka-shifted",
"ar-u-nu-native",
"ar-u-nu-latn",
"ja-t-it",
"ja-Kana-t-it",
"und-Latn-t-und-cyrl"
],
"casing": [
"en-US-U-hc-h12",
"en-GB-u-CA-gregory-hc-h12",
"es-AR-x-Private",
"th-u-ca-buDDhist",
"de-u-co-phonebk-KA-shifted",
"AR_U-NU-native",
"ar-u-nu-LaTN",
"jA-T-it",
"ja-kanA-T-IT",
"unD-Latn-T-und-cyrl"
]
}

View File

@@ -0,0 +1,28 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use serde::Deserialize;
/// Benchmark fixture for one kind of subtag: inputs that must parse and inputs that must not.
// NOTE(review): `dead_code` is presumably allowed because every bench target compiles this
// module but reads only some of its types and fields — confirm before removing.
#[derive(Deserialize)]
#[allow(dead_code)]
pub struct SubtagData {
/// Inputs the subtag benchmark expects to parse successfully.
pub valid: Vec<String>,
/// Inputs the subtag benchmark expects the parser to reject.
pub invalid: Vec<String>,
}
/// Top-level shape of the subtag fixture: one [`SubtagData`] per subtag kind.
// NOTE(review): `dead_code` is presumably allowed because not every bench target that
// includes this module uses this type — confirm before removing.
#[derive(Deserialize)]
#[allow(dead_code)]
pub struct Subtags {
/// Fixture inputs for language subtags.
pub language: SubtagData,
/// Fixture inputs for script subtags.
pub script: SubtagData,
/// Fixture inputs for region subtags.
pub region: SubtagData,
/// Fixture inputs for variant subtags.
pub variant: SubtagData,
}
/// Fixture shape shared by the language-identifier and locale benchmarks.
// NOTE(review): `dead_code` is presumably allowed because not every bench target that
// includes this module uses this type — confirm before removing.
#[derive(Deserialize)]
#[allow(dead_code)]
pub struct LocaleList {
/// Identifier strings used by the parse, stringify and compare benchmarks.
pub canonicalized: Vec<String>,
/// Identifier strings fed to the canonicalize benchmarks.
pub casing: Vec<String>,
}

View File

@@ -0,0 +1,18 @@
{
"language": {
"valid": ["en", "it", "pl", "de", "fr", "cs", "csb", "und", "ru", "nb", "NB", "UK", "pL", "Zh", "ES"],
"invalid": ["", "1", "$", "a1", "1211", "as_sa^a", "-0we", "3e3", "kk$$22", "testingaverylongstring"]
},
"script": {
"valid": ["Latn", "latn", "Arab", "xxxx", "Flan", "fAlA", "oOoO", "pPlQ", "esta", "RUSS"],
"invalid": ["", "1", "$", "a1", "1211", "assaa", "-0we", "3e3", "kk$$22", "testingaverylongstring"]
},
"region": {
"valid": ["DE", "321", "zh", "IA", "fN", "rU", "ru", "RU", "Ru", "CN", "AR"],
"invalid": ["", "1", "$", "a1", "1211", "assaa", "-0we", "3e3", "kk$$22", "testingaverylongstring"]
},
"variant": {
"valid": ["macos", "MaCoS", "windows", "posix", "POSIX", "Posix", "linux", "lINUX", "mAcOs", "testing", "WWWWWW"],
"invalid": ["", "1", "$", "a1", "a211", "ass__aa", "-0we", "3e3", "kk$$22", "testingaverylongstring"]
}
}

View File

@@ -0,0 +1,110 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Registers an end-to-end benchmark named `"overview"` on `$c`.
///
/// Each iteration parses every string of `$data_str` into `$struct` (panicking with
/// "Parsing failed" on error), counts the values for which `normalizing_eq($compare)` holds,
/// and returns all values rendered with `to_string()`.
///
/// The expansion calls `black_box` unqualified, so it must be in scope where the macro is used.
#[macro_export]
macro_rules! overview {
($c:expr, $struct:ident, $data_str:expr, $compare:expr) => {
$c.bench_function("overview", |b| {
b.iter(|| {
let mut values = vec![];
for s in $data_str {
let value: Result<$struct, _> = black_box(s).parse();
values.push(value.expect("Parsing failed"));
}
// The count is discarded; only the comparison work is of interest.
let _ = values
.iter()
.filter(|&v| v.normalizing_eq($compare))
.count();
// The stringified values are the closure's return value.
values
.iter()
.map(|v| v.to_string())
.collect::<Vec<String>>()
})
});
};
}
/// Registers a benchmark named `$struct_name` on `$c` that parses every string of `$data_str`
/// into `$struct`.
///
/// The parse result is discarded without being unwrapped, so failed parses do not panic.
/// The expansion calls `black_box` unqualified, so it must be in scope where the macro is used.
#[macro_export]
macro_rules! construct {
($c:expr, $struct:ident, $struct_name:expr, $data_str:expr) => {
$c.bench_function($struct_name, |b| {
b.iter(|| {
for s in $data_str {
let _: Result<$struct, _> = black_box(s).parse();
}
})
});
};
}
/// Registers two stringification benchmarks on `$c` for the values in `$data`.
///
/// The first, named `$struct_name`, calls `to_string()` on every value. The second, named
/// `$struct_name` followed by `"/writeable"`, calls `Writeable::write_to_string()` instead.
/// `$struct_name` is passed to `concat!`, so it has to be a literal. `$struct` is accepted
/// but not used by the expansion.
///
/// The expansion calls `black_box` unqualified, so it must be in scope where the macro is used.
#[macro_export]
macro_rules! to_string {
($c:expr, $struct:ident, $struct_name:expr, $data:expr) => {
$c.bench_function($struct_name, |b| {
b.iter(|| {
for s in $data {
let _ = black_box(s).to_string();
}
})
});
$c.bench_function(std::concat!($struct_name, "/writeable"), |b| {
// Imported locally so callers do not need `Writeable` in scope.
use writeable::Writeable;
b.iter(|| {
for s in $data {
let _ = black_box(s).write_to_string();
}
})
});
};
}
/// Registers a benchmark with id `("struct", $struct_name)` that compares the elements of
/// `$data1` and `$data2` pairwise with `==`.
///
/// Pairs are formed with `zip`, so iteration stops at the shorter input. `$struct` is accepted
/// but not used by the expansion.
///
/// The expansion names `BenchmarkId` and `black_box` unqualified, so both must be in scope
/// where the macro is used.
#[macro_export]
macro_rules! compare_struct {
($c:expr, $struct:ident, $struct_name:expr, $data1:expr, $data2:expr) => {
$c.bench_function(BenchmarkId::new("struct", $struct_name), |b| {
b.iter(|| {
for (lid1, lid2) in $data1.iter().zip($data2.iter()) {
let _ = black_box(lid1) == black_box(lid2);
}
})
});
};
}
/// Registers two benchmarks that compare parsed values (`$data1`) against strings (`$data2`).
///
/// The benchmark with id `("str", $struct_name)` calls `normalizing_eq` on each pair; the one
/// with id `("strict_cmp", $struct_name)` calls `strict_cmp` with the string's bytes.
/// Pairs are formed with `zip`, so iteration stops at the shorter input. `$struct` is accepted
/// but not used by the expansion.
///
/// The expansion names `BenchmarkId` and `black_box` unqualified, so both must be in scope
/// where the macro is used.
#[macro_export]
macro_rules! compare_str {
($c:expr, $struct:ident, $struct_name:expr, $data1:expr, $data2:expr) => {
$c.bench_function(BenchmarkId::new("str", $struct_name), |b| {
b.iter(|| {
for (lid, s) in $data1.iter().zip($data2.iter()) {
let _ = black_box(lid).normalizing_eq(&black_box(s));
}
})
});
$c.bench_function(BenchmarkId::new("strict_cmp", $struct_name), |b| {
b.iter(|| {
for (lid, s) in $data1.iter().zip($data2.iter()) {
let _ = black_box(lid).strict_cmp(&black_box(s).as_str().as_bytes());
}
})
});
};
}
/// Registers a benchmark named `$struct_name` on `$c` that runs `$struct::normalize` over
/// every element of `$data`.
///
/// Each iteration makes two passes over `$data`: the first calls `to_string()` on every
/// element, the second calls `$struct::normalize`. Both results are discarded.
///
/// The expansion calls `black_box` unqualified, so it must be in scope where the macro is used.
#[macro_export]
macro_rules! canonicalize {
($c:expr, $struct:ident, $struct_name:expr, $data:expr) => {
$c.bench_function($struct_name, |b| {
b.iter(|| {
// NOTE(review): the purpose of this `to_string` pass is not evident from this file;
// presumably it adds an allocation baseline to the measurement — confirm.
for s in $data {
let _ = black_box(s).to_string();
}
for s in $data {
let _ = $struct::normalize(black_box(s));
}
})
});
};
}

View File

@@ -0,0 +1,5 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod macros;

View File

@@ -0,0 +1,125 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use icu_locale_core::{langid, subtags::language, subtags::region, LanguageIdentifier};
use std::borrow::Cow;
use writeable::Writeable;
/// Language identifiers built with `langid!`, shared by the comparison, matching and
/// serialization benchmarks below.
const LIDS: &[LanguageIdentifier] = &[
langid!("en"),
langid!("pl"),
langid!("fr-CA"),
langid!("zh-Hans"),
langid!("en-US"),
langid!("en-Latn-US"),
langid!("sr-Cyrl-BA"),
];
/// The same identifiers as `LIDS`, in the same order, as unparsed strings.
const LIDS_STR: &[&str] = &[
"en",
"pl",
"fr-CA",
"zh-Hans",
"en-US",
"en-Latn-US",
"sr-Cyrl-BA",
];
/// Parses every entry of `LIDS_STR` into a `LanguageIdentifier`, panicking if any fails.
fn bench_langid_constr() {
// Tests the instructions required to construct a LID from an str.
let _: Vec<LanguageIdentifier> = LIDS_STR
.iter()
.map(|l| l.parse().expect("Failed to parse"))
.collect();
}
/// Counts the entries of `LIDS` whose language is `en` and whose region is `US`.
///
/// The region is compared as a whole `Option`. The assertion pins the expected count to 2.
fn bench_langid_compare_components() {
// Tests the cost of comparing LID components.
let result = LIDS
.iter()
.filter(|l| l.language == language!("en") && l.region == Some(region!("US")))
.count();
assert_eq!(result, 2);
}
/// Counts the entries of `LIDS` whose language is `en` and whose region is `US`.
///
/// Same filter as `bench_langid_compare_components`, except that the region is tested through
/// `Option::map` + `unwrap_or(false)` instead of comparing two `Option`s directly.
fn bench_langid_compare_components_str() {
// Tests the cost of comparing LID components when the optional region is unwrapped first.
let result = LIDS
.iter()
.filter(|l| {
l.language == language!("en") && l.region.map(|r| r == region!("US")).unwrap_or(false)
})
.count();
assert_eq!(result, 2);
}
/// Counts the entries of `LIDS_STR` whose bytes compare `Equal` to `langid!("en-us")` under
/// `strict_cmp`. The assertion pins the expected count to 1.
fn bench_langid_strict_cmp() {
// Tests the cost of comparing a langid against byte strings.
use core::cmp::Ordering;
let lid = langid!("en-us");
let result = LIDS_STR
.iter()
.filter(|s| lid.strict_cmp(s.as_bytes()) == Ordering::Equal)
.count();
assert_eq!(result, 1);
}
/// Counts the entries of `LIDS` equal (`==`) to `langid!("en-us")`; exactly one is expected.
fn bench_langid_matching() {
// Tests matching a LID against other LIDs.
let lid = langid!("en-us");
let count = LIDS.iter().filter(|l| lid == **l).count();
assert_eq!(count, 1);
}
/// Counts the entries of `LIDS_STR` for which `langid!("en-us").normalizing_eq` returns true;
/// exactly one is expected.
fn bench_langid_matching_str() {
// Tests matching a LID against list of str.
let lid = langid!("en-us");
let count = LIDS_STR.iter().filter(|&l| lid.normalizing_eq(l)).count();
assert_eq!(count, 1);
}
/// Converts every entry of `LIDS` to a `String` with `to_string()`.
fn bench_langid_serialize() {
// Tests serialization of LIDs.
let _: Vec<String> = LIDS.iter().map(|l| l.to_string()).collect();
}
/// Serializes every entry of `LIDS` with `Writeable::write_to_string()`.
fn bench_langid_serialize_writeable() {
// Tests serialization of LIDs through the `Writeable` trait rather than `to_string()`.
let _: Vec<_> = LIDS.iter().map(|l| l.write_to_string()).collect();
}
/// Runs `LanguageIdentifier::normalize` on every entry of `LIDS_STR`, panicking with
/// "Normalization failed" if any call returns an error.
fn bench_langid_canonicalize() {
// Tests canonicalization of strings.
let _: Vec<Cow<str>> = LIDS_STR
.iter()
.map(|l| LanguageIdentifier::normalize(l).expect("Normalization failed"))
.collect();
}
// Hands every benchmark function defined above to `iai::main!`.
// NOTE(review): presumably this macro generates the binary's entry point — confirm against
// the iai documentation.
iai::main!(
bench_langid_constr,
bench_langid_compare_components,
bench_langid_compare_components_str,
bench_langid_strict_cmp,
bench_langid_matching,
bench_langid_matching_str,
bench_langid_serialize,
bench_langid_serialize_writeable,
bench_langid_canonicalize,
);

View File

@@ -0,0 +1,92 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod fixtures;
mod helpers;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use icu_locale_core::LanguageIdentifier;
/// Registers every `LanguageIdentifier` Criterion benchmark.
///
/// Inputs come from `fixtures/langid.json`, embedded at compile time. Groups are created in a
/// fixed order: overview, construct, to_string, compare, canonicalize.
fn langid_benches(c: &mut Criterion) {
    // `compare_struct!` and `compare_str!` expand to code that names `BenchmarkId`.
    use criterion::BenchmarkId;

    let fixture_json = include_str!("fixtures/langid.json");
    let data = serde_json::from_str::<fixtures::LocaleList>(fixture_json)
        .expect("Failed to read a fixture");

    // Parses the canonicalized fixture strings into a fresh vector on every call.
    let parse_canonicalized = || -> Vec<LanguageIdentifier> {
        data.canonicalized
            .iter()
            .map(|s| s.parse().unwrap())
            .collect()
    };

    // Overview
    let mut group = c.benchmark_group("langid");
    overview!(group, LanguageIdentifier, &data.canonicalized, "en-US");
    group.finish();

    // Construct
    let mut group = c.benchmark_group("langid/construct");
    construct!(group, LanguageIdentifier, "langid", &data.canonicalized);
    group.finish();

    // Stringify
    let mut group = c.benchmark_group("langid/to_string");
    let langids = parse_canonicalized();
    to_string!(group, LanguageIdentifier, "langid", &langids);
    group.finish();

    // Compare
    let mut group = c.benchmark_group("langid/compare");
    let langids = parse_canonicalized();
    let langids2 = parse_canonicalized();
    compare_struct!(group, LanguageIdentifier, "langid", &langids, &langids2);
    compare_str!(
        group,
        LanguageIdentifier,
        "langid",
        &langids,
        &data.canonicalized
    );
    group.finish();

    // Canonicalize
    let mut group = c.benchmark_group("langid/canonicalize");
    canonicalize!(group, LanguageIdentifier, "langid", &data.casing);
    group.finish();
}
criterion_group!(benches, langid_benches,);
criterion_main!(benches);

View File

@@ -0,0 +1,86 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod fixtures;
mod helpers;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use icu_locale_core::Locale;
/// Registers every `Locale` Criterion benchmark.
///
/// Inputs come from `fixtures/locale.json`, embedded at compile time. Groups are created in a
/// fixed order: overview, construct, to_string, compare, canonicalize.
fn locale_benches(c: &mut Criterion) {
    // `compare_struct!` and `compare_str!` expand to code that names `BenchmarkId`.
    use criterion::BenchmarkId;

    let fixture_json = include_str!("fixtures/locale.json");
    let data = serde_json::from_str::<fixtures::LocaleList>(fixture_json)
        .expect("Failed to read a fixture");

    // Parses the canonicalized fixture strings into a fresh vector on every call.
    let parse_canonicalized = || -> Vec<Locale> {
        data.canonicalized
            .iter()
            .map(|s| s.parse().unwrap())
            .collect()
    };

    // Overview
    let mut group = c.benchmark_group("locale");
    overview!(group, Locale, &data.canonicalized, "en-US");
    group.finish();

    // Construct
    let mut group = c.benchmark_group("locale/construct");
    construct!(group, Locale, "locale", &data.canonicalized);
    group.finish();

    // Stringify
    let mut group = c.benchmark_group("locale/to_string");
    let locales = parse_canonicalized();
    to_string!(group, Locale, "locale", &locales);
    group.finish();

    // Compare
    let mut group = c.benchmark_group("locale/compare");
    let locales = parse_canonicalized();
    let locales2 = parse_canonicalized();
    compare_struct!(group, Locale, "locale", &locales, &locales2);
    compare_str!(group, Locale, "locale", &locales, &data.canonicalized);
    group.finish();

    // Canonicalize
    let mut group = c.benchmark_group("locale/canonicalize");
    canonicalize!(group, Locale, "locale", &data.casing);
    group.finish();
}
criterion_group!(benches, locale_benches,);
criterion_main!(benches);

View File

@@ -0,0 +1,39 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod fixtures;
mod helpers;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use icu_locale_core::subtags::{Language, Region, Script, Variant};
use icu_locale_core::ParseError;
/// Registers a benchmark named `subtags/<$name>/parse` on `$c`.
///
/// Each iteration parses every string in `$data.valid` into `$subtag` and unwraps the result,
/// then parses every string in `$data.invalid` and unwraps the error. A fixture entry in the
/// wrong list therefore makes the benchmark panic.
macro_rules! subtag_bench {
($c:expr, $name:expr, $subtag:ident, $data:expr) => {
$c.bench_function(&format!("subtags/{}/parse", $name), |b| {
b.iter(|| {
for s in &$data.valid {
let _: $subtag = black_box(s).parse().unwrap();
}
for s in &$data.invalid {
let _: ParseError = black_box(s).parse::<$subtag>().unwrap_err();
}
})
});
};
}
/// Registers one parse benchmark per subtag kind, in the order language, script, region,
/// variant, using the inputs embedded from `fixtures/subtags.json`.
fn subtags_bench(c: &mut Criterion) {
    let fixture_json = include_str!("fixtures/subtags.json");
    let fixtures::Subtags {
        language,
        script,
        region,
        variant,
    } = serde_json::from_str::<fixtures::Subtags>(fixture_json)
        .expect("Failed to read a fixture");

    subtag_bench!(c, "language", Language, language);
    subtag_bench!(c, "script", Script, script);
    subtag_bench!(c, "region", Region, region);
    subtag_bench!(c, "variant", Variant, variant);
}
criterion_group!(benches, subtags_bench,);
criterion_main!(benches);

445
vendor/icu_locale_core/src/data.rs vendored Normal file
View File

@@ -0,0 +1,445 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions::unicode as unicode_ext;
use crate::subtags::{Language, Region, Script, Subtag, Variant};
#[cfg(feature = "alloc")]
use crate::ParseError;
use crate::{LanguageIdentifier, Locale};
use core::cmp::Ordering;
use core::default::Default;
use core::fmt;
use core::hash::Hash;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A locale type optimized for use in fallbacking and the ICU4X data pipeline.
///
/// [`DataLocale`] contains less functionality than [`Locale`] but more than
/// [`LanguageIdentifier`] for better size and performance while still meeting
/// the needs of the ICU4X data pipeline.
///
/// You can create a [`DataLocale`] from a borrowed [`Locale`], which is more
/// efficient than cloning the [`Locale`], but less efficient than converting an owned
/// [`Locale`]:
///
/// ```
/// use icu_locale_core::locale;
/// use icu_provider::DataLocale;
///
/// let locale1 = locale!("en-u-ca-buddhist");
/// let data_locale = DataLocale::from(&locale1);
/// ```
///
/// [`DataLocale`] only supports `-u-sd` keywords, to reflect the current state of CLDR data
/// lookup and fallback. This may change in the future.
///
/// ```
/// use icu_locale_core::{locale, Locale};
/// use icu_provider::DataLocale;
///
/// let locale = "hi-IN-t-en-h0-hybrid-u-attr-ca-buddhist-sd-inas"
/// .parse::<Locale>()
/// .unwrap();
///
/// assert_eq!(
/// DataLocale::from(locale),
/// DataLocale::from(locale!("hi-IN-u-sd-inas"))
/// );
/// ```
#[derive(Clone, Copy, PartialEq, Hash, Eq)]
#[non_exhaustive]
pub struct DataLocale {
/// Language subtag
pub language: Language,
/// Script subtag
pub script: Option<Script>,
/// Region subtag
pub region: Option<Region>,
/// Variant subtag. Only one variant is stored; converting from a [`LanguageIdentifier`]
/// keeps the first one.
pub variant: Option<Variant>,
/// Subdivision (-u-sd-) subtag
pub subdivision: Option<Subtag>,
}
impl Default for DataLocale {
    /// Builds the default locale: [`Language::UNKNOWN`] with every optional subtag unset.
    fn default() -> Self {
        let language = Language::UNKNOWN;
        let (script, region, variant, subdivision) = (None, None, None, None);
        Self {
            language,
            script,
            region,
            variant,
            subdivision,
        }
    }
}
impl DataLocale {
    /// `const` counterpart of `Default::default`, usable in `const` and `static` initializers.
    ///
    /// Yields [`Language::UNKNOWN`] with no script, region, variant or subdivision.
    pub const fn default() -> Self {
        Self {
            subdivision: None,
            variant: None,
            region: None,
            script: None,
            language: Language::UNKNOWN,
        }
    }
}
impl Default for &DataLocale {
    /// Returns a reference to a `static` holding the default [`DataLocale`].
    fn default() -> Self {
        // Initialized through the inherent `const fn default`, so no runtime setup is needed.
        static DEFAULT_LOCALE: DataLocale = DataLocale::default();
        &DEFAULT_LOCALE
    }
}
impl fmt::Debug for DataLocale {
    /// Writes `DataLocale{…}`, with the locale's `Display` output between the braces.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_fmt(format_args!("DataLocale{{{self}}}"))
    }
}
// Invokes a crate-internal macro for `DataLocale`. The trailing `condition => value` arm
// yields `Some(language.as_str())` when the script, region, variant and subdivision are all unset.
// NOTE(review): presumably this generates the `Writeable`/`Display` impls on top of
// `for_each_subtag_str`, with the arm acting as a borrowed-string fast path — confirm against
// the macro definition.
impl_writeable_for_each_subtag_str_no_test!(DataLocale, selff, selff.script.is_none() && selff.region.is_none() && selff.variant.is_none() && selff.subdivision.is_none() => Some(selff.language.as_str()));
impl From<LanguageIdentifier> for DataLocale {
    /// Converts an owned [`LanguageIdentifier`] by delegating to the by-reference conversion.
    fn from(langid: LanguageIdentifier) -> Self {
        <&LanguageIdentifier as Into<Self>>::into(&langid)
    }
}
impl From<Locale> for DataLocale {
    /// Converts an owned [`Locale`] by delegating to the by-reference conversion.
    fn from(locale: Locale) -> Self {
        <&Locale as Into<Self>>::into(&locale)
    }
}
impl From<&LanguageIdentifier> for DataLocale {
fn from(langid: &LanguageIdentifier) -> Self {
Self {
language: langid.language,
script: langid.script,
region: langid.region,
variant: langid.variants.iter().copied().next(),
subdivision: None,
}
}
}
impl From<&Locale> for DataLocale {
    /// Converts the locale's language identifier and carries over the `-u-sd-` keyword when
    /// its value is a single subtag. All other extensions are discarded.
    fn from(locale: &Locale) -> Self {
        let unicode_keywords = &locale.extensions.unicode.keywords;
        let subdivision = unicode_keywords
            .get(&unicode_ext::key!("sd"))
            .and_then(|value| value.as_single_subtag().copied());
        Self {
            subdivision,
            ..Self::from(&locale.id)
        }
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for DataLocale {
    type Err = ParseError;

    /// Parses the UTF-8 bytes of `s`, exactly as [`DataLocale::try_from_str`] does.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_utf8(s.as_bytes())
    }
}
impl DataLocale {
/// Parses a [`DataLocale`] from a string slice.
///
/// The string's UTF-8 bytes are handed to [`Self::try_from_utf8`], so both entry points
/// accept and reject the same inputs.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
#[inline]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// Parses a [`DataLocale`] from a UTF-8 byte slice.
///
/// The input is first parsed as a full [`Locale`]; any error from that step is returned
/// unchanged. The result is then rejected with [`ParseError::InvalidExtension`] if it has
/// more than one variant, any transform, private-use or other extension, any Unicode
/// attribute, or any Unicode keyword other than a lone `sd`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let parsed = Locale::try_from_utf8(code_units)?;
    let extensions = &parsed.extensions;

    let has_unsupported_part = parsed.id.variants.len() > 1
        || !extensions.transform.is_empty()
        || !extensions.private.is_empty()
        || !extensions.other.is_empty()
        || !extensions.unicode.attributes.is_empty();
    if has_unsupported_part {
        return Err(ParseError::InvalidExtension);
    }

    // Either no Unicode keywords at all, or exactly one and it is `sd`.
    let keywords = &extensions.unicode.keywords;
    let keywords_supported = match keywords.iter().count() {
        0 => true,
        1 => keywords.contains_key(&unicode_ext::key!("sd")),
        _ => false,
    };
    if !keywords_supported {
        return Err(ParseError::InvalidExtension);
    }

    Ok(Self::from(parsed))
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.as_str())?;
}
if let Some(ref region) = self.region {
f(region.as_str())?;
}
if let Some(ref single_variant) = self.variant {
f(single_variant.as_str())?;
}
if let Some(ref subdivision) = self.subdivision {
f("u")?;
f("sd")?;
f(subdivision.as_str())?;
}
Ok(())
}
fn as_tuple(
&self,
) -> (
Language,
Option<Script>,
Option<Region>,
Option<Variant>,
Option<Subtag>,
) {
(
self.language,
self.script,
self.region,
self.variant,
self.subdivision,
)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// Fields are compared in the order language, script, region, variant, subdivision.
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    Ord::cmp(&lhs, &rhs)
}
/// Compare this [`DataLocale`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`DataLocale`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// ```
/// use core::cmp::Ordering;
/// use icu_provider::DataLocale;
///
/// let bcp47_strings: &[&str] = &[
/// "ca",
/// "ca-ES",
/// "ca-ES-u-sd-esct",
/// "ca-ES-valencia",
/// "cat",
/// "pl-Latn-PL",
/// "und",
/// "und-fonipa",
/// "zh",
/// ];
///
/// for ab in bcp47_strings.windows(2) {
/// let a = ab[0];
/// let b = ab[1];
/// assert_eq!(a.cmp(b), Ordering::Less, "strings: {} < {}", a, b);
/// let a_loc: DataLocale = a.parse().unwrap();
/// assert_eq!(
/// a_loc.strict_cmp(a.as_bytes()),
/// Ordering::Equal,
/// "strict_cmp: {} == {}",
/// a_loc,
/// a
/// );
/// assert_eq!(
/// a_loc.strict_cmp(b.as_bytes()),
/// Ordering::Less,
/// "strict_cmp: {} < {}",
/// a_loc,
/// b
/// );
/// let b_loc: DataLocale = b.parse().unwrap();
/// assert_eq!(
/// b_loc.strict_cmp(b.as_bytes()),
/// Ordering::Equal,
/// "strict_cmp: {} == {}",
/// b_loc,
/// b
/// );
/// assert_eq!(
/// b_loc.strict_cmp(a.as_bytes()),
/// Ordering::Greater,
/// "strict_cmp: {} > {}",
/// b_loc,
/// a
/// );
/// }
/// ```
///
/// Comparison against invalid strings:
///
/// ```
/// use icu_provider::DataLocale;
///
/// let invalid_strings: &[&str] = &[
/// // Less than "ca-ES"
/// "CA",
/// "ar-x-gbp-FOO",
/// // Greater than "ca-ES"
/// "ca_ES",
/// "ca-ES-x-gbp-FOO",
/// ];
///
/// let data_locale = "ca-ES".parse::<DataLocale>().unwrap();
///
/// for s in invalid_strings.iter() {
/// let expected_ordering = "ca-ES".cmp(s);
/// let actual_ordering = data_locale.strict_cmp(s.as_bytes());
/// assert_eq!(expected_ordering, actual_ordering, "{}", s);
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
// Delegates to `writeable::cmp_utf8`, passing this locale and the raw bytes.
writeable::cmp_utf8(self, other)
}
/// Returns whether this [`DataLocale`] is `und` in the locale and extensions portion.
///
/// That is the case only when the language is unknown and no script, region, variant or
/// subdivision is set.
///
/// # Examples
///
/// ```
/// use icu_provider::DataLocale;
///
/// assert!("und".parse::<DataLocale>().unwrap().is_unknown());
/// assert!(!"de-u-sd-denw".parse::<DataLocale>().unwrap().is_unknown());
/// assert!(!"und-ES".parse::<DataLocale>().unwrap().is_unknown());
/// ```
pub fn is_unknown(&self) -> bool {
    self.language.is_unknown()
        && matches!(
            (self.script, self.region, self.variant, self.subdivision),
            (None, None, None, None)
        )
}
/// Converts this `DataLocale` into a [`Locale`].
///
/// The language, script, and region carry over unchanged. The optional
/// variant is wrapped with `Variants::from_variant`, and the optional
/// subdivision is stored as the value of the `sd` Unicode extension keyword.
pub fn into_locale(self) -> Locale {
    let variants = match self.variant {
        Some(variant) => crate::subtags::Variants::from_variant(variant),
        None => Default::default(),
    };

    // Only the Unicode extension is ever populated, and only when a
    // subdivision is present; everything else stays at its default.
    let mut extensions = crate::extensions::Extensions::default();
    if let Some(sd) = self.subdivision {
        let keywords = unicode_ext::Keywords::new_single(
            unicode_ext::key!("sd"),
            unicode_ext::Value::from_subtag(Some(sd)),
        );
        extensions.unicode = unicode_ext::Unicode {
            keywords,
            ..Default::default()
        };
    }

    Locale {
        id: LanguageIdentifier {
            language: self.language,
            script: self.script,
            region: self.region,
            variants,
        },
        extensions,
    }
}
}
#[test]
fn test_data_locale_to_string() {
    // Each entry is (locale to parse, expected serialization).
    let cases: [(&str, &str); 3] = [
        ("und", "und"),
        ("und-u-sd-sdd", "und-u-sd-sdd"),
        ("en-ZA-u-sd-zaa", "en-ZA-u-sd-zaa"),
    ];
    for (input, expected) in cases {
        let parsed: DataLocale = input.parse().unwrap();
        writeable::assert_writeable_eq!(parsed, expected);
    }
}
#[test]
fn test_data_locale_from_string() {
    #[derive(Debug)]
    struct TestCase {
        pub input: &'static str,
        pub success: bool,
    }
    let cases = [
        TestCase {
            input: "und",
            success: true,
        },
        TestCase {
            input: "und-u-cu-gbp",
            success: false,
        },
        TestCase {
            input: "en-ZA-u-sd-zaa",
            success: true,
        },
        TestCase {
            input: "en...",
            success: false,
        },
    ];
    for cas in cases {
        let parsed = DataLocale::from_str(cas.input);
        if !cas.success {
            // Expected failures only need to fail; there is nothing to round-trip.
            if parsed.is_ok() {
                panic!("DataLocale parsed but it was supposed to fail: {cas:?}");
            }
            continue;
        }
        let Ok(data_locale) = parsed else {
            panic!("DataLocale was supposed to parse but it failed: {cas:?}");
        };
        // A successfully parsed locale must serialize back to its input.
        writeable::assert_writeable_eq!(data_locale, cas.input);
    }
}

22
vendor/icu_locale_core/src/databake.rs vendored Normal file
View File

@@ -0,0 +1,22 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::LanguageIdentifier;
use databake::*;
impl Bake for LanguageIdentifier {
    /// Registers `icu_locale_core` with the bake environment and emits tokens
    /// that rebuild this identifier from its string form.
    ///
    /// Identifiers with at most one variant are emitted as a `langid!` macro
    /// invocation; all others are emitted as a `try_from_str(..).unwrap()` call.
    fn bake(&self, env: &CrateEnv) -> TokenStream {
        env.insert("icu_locale_core");
        let repr = self.to_string();
        // NOTE(review): presumably `langid!` cannot express more than one variant — confirm.
        if self.variants.len() > 1 {
            return quote! {
                icu_locale_core::LanguageIdentifier::try_from_str(#repr).unwrap()
            };
        }
        quote! {
            icu_locale_core::langid!(#repr)
        }
    }
}

View File

@@ -0,0 +1,399 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with
//! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`]
//! is called [`Locale`].
//!
//! There are four types of extensions:
//!
//! * [`Unicode Extensions`] - marked as `u`.
//! * [`Transform Extensions`] - marked as `t`.
//! * [`Private Use Extensions`] - marked as `x`.
//! * [`Other Extensions`] - marked as any `a-z` except `u`, `t` and `x`.
//!
//! One can think of extensions as a bag of extra information on top of the four basic [`subtags`].
//!
//! Notice: `Other` extensions are parsed and preserved as pass-through data; no Unicode related
//! behavior depends on them.
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::unicode::{Key, Value};
//! use icu::locale::Locale;
//!
//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
//! .parse()
//! .expect("Failed to parse.");
//!
//! assert_eq!(loc.id.language, "en".parse().unwrap());
//! assert_eq!(loc.id.script, None);
//! assert_eq!(loc.id.region, Some("US".parse().unwrap()));
//! assert_eq!(loc.id.variants.len(), 0);
//!
//! let key: Key = "ca".parse().expect("Parsing key failed.");
//! let value: Value = "buddhist".parse().expect("Parsing value failed.");
//! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
//! ```
//!
//! # Syntactic vs Semantic Extension Handling
//!
//! This module is useful when you need to work with Locale extensions at a syntactic level,
//! perhaps for parsing or generating locale identifiers that include any syntactically valid
//! extensions.
//! For handling and validating known CLDR values with semantic meaning, see the
//! [`crate::preferences::extensions`] module.
//!
//! [`LanguageIdentifier`]: super::LanguageIdentifier
//! [`Locale`]: super::Locale
//! [`subtags`]: super::subtags
//! [`Other Extensions`]: other
//! [`Private Use Extensions`]: private
//! [`Transform Extensions`]: transform
//! [`Unicode Extensions`]: unicode
pub mod other;
pub mod private;
pub mod transform;
pub mod unicode;
use core::cmp::Ordering;
use other::Other;
use private::{Private, PRIVATE_EXT_CHAR};
use transform::{Transform, TRANSFORM_EXT_CHAR};
use unicode::{Unicode, UNICODE_EXT_CHAR};
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::subtags;
/// Defines the type of extension.
///
/// Each variant corresponds to the singleton subtag that introduces the extension.
// NOTE: `PartialOrd`/`Ord` are derived, so comparisons follow the declaration order of the variants.
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
#[non_exhaustive]
pub enum ExtensionType {
/// Transform Extension Type marked as `t`.
Transform,
/// Unicode Extension Type marked as `u`.
Unicode,
/// Private Extension Type marked as `x`.
Private,
/// All other extension types, carrying the singleton byte that marks the extension.
Other(u8),
}
impl ExtensionType {
    /// Classifies a singleton given as a byte slice; the slice must hold exactly one byte.
    #[allow(dead_code)]
    pub(crate) const fn try_from_byte_slice(key: &[u8]) -> Result<Self, ParseError> {
        match key {
            [single] => Self::try_from_byte(*single),
            _ => Err(ParseError::InvalidExtension),
        }
    }

    /// Classifies a singleton byte, ignoring ASCII case.
    ///
    /// Anything outside `a-z` after lowercasing is rejected with
    /// `ParseError::InvalidExtension`.
    pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParseError> {
        let lowered = key.to_ascii_lowercase();
        // The dedicated singletons are tested before the generic `a-z` arm.
        match lowered as char {
            TRANSFORM_EXT_CHAR => Ok(Self::Transform),
            UNICODE_EXT_CHAR => Ok(Self::Unicode),
            PRIVATE_EXT_CHAR => Ok(Self::Private),
            'a'..='z' => Ok(Self::Other(lowered)),
            _ => Err(ParseError::InvalidExtension),
        }
    }

    /// Classifies a singleton given as UTF-8 code units; exactly one code unit is accepted.
    pub(crate) const fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
        if let [first] = code_units {
            Self::try_from_byte(*first)
        } else {
            Err(ParseError::InvalidExtension)
        }
    }
}
/// A map of extensions associated with a given [`Locale`](crate::Locale).
///
/// The `other` field changes type with the `alloc` Cargo feature: it is an owned `Vec` when the
/// feature is enabled and a `'static` slice otherwise.
#[derive(Debug, Default, PartialEq, Eq, Clone, Hash)]
#[non_exhaustive]
pub struct Extensions {
/// A representation of the data for a Unicode extension, when present in the locale identifier.
pub unicode: Unicode,
/// A representation of the data for a transform extension, when present in the locale identifier.
pub transform: Transform,
/// A representation of the data for a private-use extension, when present in the locale identifier.
pub private: Private,
/// A sequence of any other extensions that are present in the locale identifier but are not formally
/// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
/// and [`Private`] are.
#[cfg(feature = "alloc")]
pub other: Vec<Other>,
/// A sequence of any other extensions that are present in the locale identifier but are not formally
/// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
/// and [`Private`] are.
#[cfg(not(feature = "alloc"))]
pub other: &'static [Other],
}
impl Extensions {
/// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::Extensions;
///
/// assert_eq!(Extensions::new(), Extensions::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self {
unicode: Unicode::new(),
transform: Transform::new(),
private: Private::new(),
#[cfg(feature = "alloc")]
other: Vec::new(),
#[cfg(not(feature = "alloc"))]
other: &[],
}
}
/// Creates a map of extensions that holds the given Unicode extension and nothing else.
///
/// The transform, private-use, and other extensions are left empty. Callable in `const`
/// context.
#[inline]
pub const fn from_unicode(unicode: Unicode) -> Self {
Self {
unicode,
transform: Transform::new(),
private: Private::new(),
#[cfg(feature = "alloc")]
other: Vec::new(),
#[cfg(not(feature = "alloc"))]
other: &[],
}
}
/// Reports whether no extension of any kind is present.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
///
/// assert!(!loc.extensions.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
    let has_known_extension =
        !self.unicode.is_empty() || !self.transform.is_empty() || !self.private.is_empty();
    !has_known_extension && self.other.is_empty()
}
/// Borrows every component of the extensions as one tuple, in the order
/// unicode, transform, private, other.
#[expect(clippy::type_complexity)]
pub(crate) fn as_tuple(
    &self,
) -> (
    (&unicode::Attributes, &unicode::Keywords),
    (
        Option<(
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
            &subtags::Variants,
        )>,
        &transform::Fields,
    ),
    &private::Private,
    &[other::Other],
) {
    let unicode_parts = self.unicode.as_tuple();
    let transform_parts = self.transform.as_tuple();
    // The field is a `Vec` or a `'static` slice depending on features; view it as a slice either way.
    let others: &[other::Other] = &self.other;
    (unicode_parts, transform_parts, &self.private, others)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// The ordering may or may not be equivalent to string ordering, and it
/// may or may not be stable across ICU4X releases.
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    // Compare the tuple views of both sides, component by component.
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    Ord::cmp(&lhs, &rhs)
}
/// Retains the specified extension types, clearing all others.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::ExtensionType;
/// use icu::locale::Locale;
///
/// let loc: Locale =
/// "und-a-hello-t-mul-u-world-z-zzz-x-extra".parse().unwrap();
///
/// let mut only_unicode = loc.clone();
/// only_unicode
/// .extensions
/// .retain_by_type(|t| t == ExtensionType::Unicode);
/// assert_eq!(only_unicode, "und-u-world".parse().unwrap());
///
/// let mut only_t_z = loc.clone();
/// only_t_z.extensions.retain_by_type(|t| {
/// t == ExtensionType::Transform || t == ExtensionType::Other(b'z')
/// });
/// assert_eq!(only_t_z, "und-t-mul-z-zzz".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn retain_by_type<F>(&mut self, mut predicate: F)
where
    F: FnMut(ExtensionType) -> bool,
{
    // The predicate is consulted in a fixed order: unicode, transform,
    // private, then each `other` extension in stored order.
    let keep_unicode = predicate(ExtensionType::Unicode);
    if !keep_unicode {
        self.unicode.clear();
    }
    let keep_transform = predicate(ExtensionType::Transform);
    if !keep_transform {
        self.transform.clear();
    }
    let keep_private = predicate(ExtensionType::Private);
    if !keep_private {
        self.private.clear();
    }
    self.other.retain(|ext| {
        let kind = ExtensionType::Other(ext.get_ext_byte());
        predicate(kind)
    });
}
/// Parses every extension remaining in `iter`.
///
/// Each extension must begin with a one-byte singleton. The matching
/// per-extension parser is then handed the same iterator. `Other`
/// extensions are stored ordered by their singleton byte.
///
/// # Errors
///
/// - `ParseError::InvalidExtension` if a singleton is expected but the
///   subtag is not exactly one byte, or the byte is not a valid singleton.
/// - `ParseError::DuplicatedExtension` if the same singleton appears twice.
/// - Any error returned by the per-extension parser.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    let mut unicode = None;
    let mut transform = None;
    let mut private = None;
    let mut other = Vec::new();
    // A `for` loop cannot be used here: the per-extension parsers below
    // are handed the same iterator.
    while let Some(subtag) = iter.next() {
        // The one-byte pattern also rejects an empty subtag.
        let &[singleton] = subtag else {
            return Err(ParseError::InvalidExtension);
        };
        match ExtensionType::try_from_byte(singleton) {
            Ok(ExtensionType::Unicode) => {
                if unicode.is_some() {
                    return Err(ParseError::DuplicatedExtension);
                }
                unicode = Some(Unicode::try_from_iter(iter)?);
            }
            Ok(ExtensionType::Transform) => {
                if transform.is_some() {
                    return Err(ParseError::DuplicatedExtension);
                }
                transform = Some(Transform::try_from_iter(iter)?);
            }
            Ok(ExtensionType::Private) => {
                if private.is_some() {
                    return Err(ParseError::DuplicatedExtension);
                }
                private = Some(Private::try_from_iter(iter)?);
            }
            Ok(ExtensionType::Other(ext)) => {
                // `other` holds at most one entry per singleton and is kept
                // sorted by it, so one binary search both detects a
                // duplicate and yields the insertion point.
                let idx = match other.binary_search_by_key(&ext, |o: &Other| o.get_ext_byte()) {
                    Ok(_) => return Err(ParseError::DuplicatedExtension),
                    Err(idx) => idx,
                };
                let parsed = Other::try_from_iter(ext, iter)?;
                other.insert(idx, parsed);
            }
            _ => return Err(ParseError::InvalidExtension),
        }
    }
    Ok(Self {
        unicode: unicode.unwrap_or_default(),
        transform: transform.unwrap_or_default(),
        private: private.unwrap_or_default(),
        other,
    })
}
/// Feeds every subtag of every extension to `f`, stopping at the first error.
///
/// Extensions are emitted alphabetically by singleton, except that the
/// private-use extension always comes last.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    // `t` and `u` are adjacent in the alphabet, so they are emitted together
    // just before the first `other` extension that sorts after `t`.
    let mut tu_pending = true;
    for ext in self.other.iter() {
        if tu_pending && ext.get_ext() > TRANSFORM_EXT_CHAR {
            self.transform.for_each_subtag_str(f, true)?;
            self.unicode.for_each_subtag_str(f, true)?;
            tu_pending = false;
        }
        ext.for_each_subtag_str(f, true)?;
    }
    if tu_pending {
        self.transform.for_each_subtag_str(f, true)?;
        self.unicode.for_each_subtag_str(f, true)?;
    }
    // Private use goes last because it allows single-character keys.
    // Alphabetical order would seem to make the other extensions `y` and `z`
    // invalid, but this is not specified.
    self.private.for_each_subtag_str(f, true)
}
}
impl_writeable_for_each_subtag_str_no_test!(Extensions);
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(Extensions::new(), "");

    // Each entry is (locale to parse, expected serialization of its extensions).
    let cases = [
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];
    for (locale, expected) in cases {
        let extensions = locale.parse::<Locale>().unwrap().extensions;
        assert_writeable_eq!(extensions, expected);
    }
}

View File

@@ -0,0 +1,260 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Other Use Extensions is a list of extensions other than unicode,
//! transform or private.
//!
//! Those extensions are treated as a pass-through, and no Unicode related
//! behavior depends on them.
//!
//! The main struct for this extension is [`Other`] which is a list of [`Subtag`]s.
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::other::Other;
//! use icu::locale::Locale;
//!
//! let mut loc: Locale = "en-US-a-foo-faa".parse().expect("Parsing failed.");
//! ```
#[cfg(feature = "alloc")]
use core::str::FromStr;
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
use crate::subtags::Subtag;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
/// A list of [`Other Use Extensions`] as defined in [`Unicode Locale
/// Identifier`] specification.
///
/// Those extensions are treated as a pass-through, and no Unicode related
/// behavior depends on them.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::other::Other;
/// use icu::locale::subtags::Subtag;
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let other = Other::from_vec_unchecked(b'a', vec![subtag1, subtag2]);
/// assert_eq!(&other.to_string(), "a-foo-bar");
/// ```
///
/// [`Other Use Extensions`]: https://unicode.org/reports/tr35/#other_extensions
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct Other {
// The singleton byte that identifies this extension.
// Safety invariant: must be ASCII
ext: u8,
// The subtags that follow the singleton, in the order they are serialized.
keys: ShortBoxSlice<Subtag>,
}
impl Other {
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Other`].
///
/// The string must start with the extension's singleton, as in `"a-foo-bar"`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns `ParseError::InvalidExtension` if the first subtag is not the singleton of an
/// `Other` extension, or if no key was collected after it.
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// Byte-slice counterpart of [`Self::try_from_str`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let mut subtags = SubtagIterator::new(code_units);
    let Some(singleton) = subtags.next() else {
        return Err(ParseError::InvalidExtension);
    };
    // Only singletons that classify as `Other` are accepted here.
    match ExtensionType::try_from_byte_slice(singleton)? {
        ExtensionType::Other(ext) => Self::try_from_iter(ext, &mut subtags),
        _ => Err(ParseError::InvalidExtension),
    }
}
/// A constructor which takes a singleton byte and a list of [`Subtag`].
///
/// The subtags are stored in the order given.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Panics
///
/// Panics if `ext` is not ASCII alphabetic.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::other::Other;
/// use icu::locale::subtags::Subtag;
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let other = Other::from_vec_unchecked(b'a', vec![subtag1, subtag2]);
/// assert_eq!(&other.to_string(), "a-foo-bar");
/// ```
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(ext: u8, keys: Vec<Subtag>) -> Self {
Self::from_short_slice_unchecked(ext, keys.into())
}
/// Builds an [`Other`] from a singleton byte and an already-assembled key list.
///
/// Panics if `ext` is not ASCII alphabetic; the keys are not inspected.
#[allow(dead_code)]
pub(crate) fn from_short_slice_unchecked(ext: u8, keys: ShortBoxSlice<Subtag>) -> Self {
assert!(ext.is_ascii_alphabetic());
// Safety invariant upheld here: ext checked as ASCII above
Self { ext, keys }
}
/// Collects the keys of an `Other` extension whose singleton `ext` has
/// already been consumed from `iter`.
///
/// Subtags are taken while they pass `Subtag::valid_key`; the first one that
/// does not is left in the iterator. Fails with
/// `ParseError::InvalidExtension` when no key was collected.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(ext: u8, iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    debug_assert!(matches!(
        ExtensionType::try_from_byte(ext),
        Ok(ExtensionType::Other(_)),
    ));
    let mut keys = ShortBoxSlice::new();
    loop {
        match iter.peek() {
            Some(candidate) if Subtag::valid_key(candidate) => {
                // A candidate that passes `valid_key` but fails to parse is
                // still consumed below; it is just not recorded.
                if let Ok(key) = Subtag::try_from_utf8(candidate) {
                    keys.push(key);
                }
            }
            _ => break,
        }
        iter.next();
    }
    if keys.is_empty() {
        return Err(ParseError::InvalidExtension);
    }
    Ok(Self::from_short_slice_unchecked(ext, keys))
}
/// Gets the tag character for this extension as a &str.
///
/// The returned string is one byte long and borrows from `self`.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "und-a-hello-world".parse().unwrap();
/// let other_ext = &loc.extensions.other[0];
/// assert_eq!(other_ext.get_ext_str(), "a");
/// ```
pub fn get_ext_str(&self) -> &str {
debug_assert!(self.ext.is_ascii_alphabetic());
// SAFETY: `self.ext` is ASCII per the safety invariant on the field, and a
// single ASCII byte is valid UTF-8.
unsafe { core::str::from_utf8_unchecked(core::slice::from_ref(&self.ext)) }
}
/// Returns the singleton that identifies this extension as a `char`.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "und-a-hello-world".parse().unwrap();
/// let other_ext = &loc.extensions.other[0];
/// assert_eq!(other_ext.get_ext(), 'a');
/// ```
pub fn get_ext(&self) -> char {
    char::from(self.ext)
}
/// Gets the tag character for this extension as a byte.
///
/// This is the stored singleton byte, returned unchanged.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "und-a-hello-world".parse().unwrap();
/// let other_ext = &loc.extensions.other[0];
/// assert_eq!(other_ext.get_ext_byte(), b'a');
/// ```
pub fn get_ext_byte(&self) -> u8 {
self.ext
}
/// Feeds this extension's subtags to `f`, stopping at the first error.
///
/// Nothing is emitted when there are no keys. The singleton is emitted
/// first only when `with_ext` is set.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    if self.keys.is_empty() {
        return Ok(());
    }
    if with_ext {
        f(self.get_ext_str())?;
    }
    for key in self.keys.iter() {
        f(key.as_str())?;
    }
    Ok(())
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Other {
    type Err = ParseError;

    /// Parses an `Other` extension, singleton included, from its string form.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_utf8(s.as_bytes())
    }
}
writeable::impl_display_with_writeable!(Other, #[cfg(feature = "alloc")]);
impl writeable::Writeable for Other {
    /// Writes the singleton followed by each key, `-`-separated.
    /// An extension without keys writes nothing.
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        if self.keys.is_empty() {
            return Ok(());
        }
        sink.write_str(self.get_ext_str())?;
        self.keys.iter().try_for_each(|key| {
            sink.write_char('-')?;
            writeable::Writeable::write_to(key, &mut *sink)
        })
    }

    /// Sums one byte for the singleton (when any key exists) plus, for each
    /// key, its own hint and one byte for the separator.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let singleton_len = if self.keys.is_empty() { 0 } else { 1 };
        let mut hint = writeable::LengthHint::exact(singleton_len);
        for key in self.keys.iter() {
            hint += writeable::Writeable::writeable_length_hint(key) + 1;
        }
        hint
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_other_extension_fromstr() {
        let parsed = "o-foo-bar".parse::<Other>().expect("Failed to parse Other");
        assert_eq!(parsed.to_string(), "o-foo-bar");

        // A singleton with no keys is not a valid extension.
        assert!("o".parse::<Other>().is_err());
    }
}

View File

@@ -0,0 +1,257 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Private Use Extensions is a list of extensions intended for
//! private use.
//!
//! Those extensions are treated as a pass-through, and no Unicode related
//! behavior depends on them.
//!
//! The main struct for this extension is [`Private`] which is a list of [`Subtag`]s.
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::private::subtag;
//! use icu::locale::{locale, Locale};
//!
//! let mut loc: Locale = "en-US-x-foo-faa".parse().expect("Parsing failed.");
//!
//! assert!(loc.extensions.private.contains(&subtag!("foo")));
//! assert_eq!(loc.extensions.private.iter().next(), Some(&subtag!("foo")));
//!
//! loc.extensions.private.clear();
//!
//! assert!(loc.extensions.private.is_empty());
//! assert_eq!(loc, locale!("en-US"));
//! ```
mod other;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
#[cfg(feature = "alloc")]
use core::str::FromStr;
#[doc(inline)]
pub use other::{subtag, Subtag};
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
pub(crate) const PRIVATE_EXT_CHAR: char = 'x';
pub(crate) const PRIVATE_EXT_STR: &str = "x";
/// A list of [`Private Use Extensions`] as defined in [`Unicode Locale
/// Identifier`] specification.
///
/// Those extensions are treated as a pass-through, and no Unicode related
/// behavior depends on them.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let private = Private::from_vec_unchecked(vec![subtag1, subtag2]);
/// assert_eq!(&private.to_string(), "x-foo-bar");
/// ```
///
/// [`Private Use Extensions`]: https://unicode.org/reports/tr35/#pu_extensions
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
// The wrapped subtags are written out in stored order and exposed as a slice through `Deref`.
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct Private(ShortBoxSlice<Subtag>);
impl Private {
/// Returns a new empty list of private-use extensions. Same as [`default()`](Default::default()), but is `const`.
///
/// An empty list writes nothing when serialized, not even the `x` singleton.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::Private;
///
/// assert_eq!(Private::new(), Private::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(ShortBoxSlice::new())
}
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Private`].
///
/// The string must start with the `x` singleton, as in `"x-foo-bar"`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a `ParseError` if the input does not start with the `x` singleton, if any
/// following subtag fails to parse, or if no subtag follows the singleton.
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// Byte-slice counterpart of [`Self::try_from_str`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let mut subtags = SubtagIterator::new(code_units);
    let Some(singleton) = subtags.next() else {
        return Err(ParseError::InvalidExtension);
    };
    // Only the private-use singleton is accepted here.
    match ExtensionType::try_from_byte_slice(singleton)? {
        ExtensionType::Private => Self::try_from_iter(&mut subtags),
        _ => Err(ParseError::InvalidExtension),
    }
}
/// A constructor which takes a list of [`Subtag`].
///
/// The subtags are stored in the order given.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let private = Private::from_vec_unchecked(vec![subtag1, subtag2]);
/// assert_eq!(&private.to_string(), "x-foo-bar");
/// ```
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(input: Vec<Subtag>) -> Self {
Self(input.into())
}
/// A constructor which takes a single [`Subtag`].
///
/// Callable in `const` context.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
///
/// let private = Private::new_single(subtag);
/// assert_eq!(&private.to_string(), "x-foo");
/// ```
pub const fn new_single(input: Subtag) -> Self {
Self(ShortBoxSlice::new_single(input))
}
/// Empties the [`Private`] list.
///
/// As the example shows, the list afterwards compares equal to [`Private::new()`].
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
/// let mut private = Private::from_vec_unchecked(vec![subtag1, subtag2]);
///
/// assert_eq!(&private.to_string(), "x-foo-bar");
///
/// private.clear();
///
/// assert_eq!(private, Private::new());
/// ```
pub fn clear(&mut self) {
self.0.clear();
}
/// Parses every remaining subtag in `iter` as a private-use subtag.
///
/// Stops at the first subtag that fails to parse and returns that error.
/// Fails with `ParseError::InvalidExtension` when no subtag was collected.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    let mut keys = ShortBoxSlice::new();
    for raw in iter {
        keys.push(Subtag::try_from_utf8(raw)?);
    }
    if keys.is_empty() {
        return Err(ParseError::InvalidExtension);
    }
    Ok(Self(keys))
}
/// Feeds this extension's subtags to `f`, stopping at the first error.
///
/// Nothing is emitted when the list is empty. The `x` singleton is emitted
/// first only when `with_ext` is set.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    if self.is_empty() {
        return Ok(());
    }
    if with_ext {
        f(PRIVATE_EXT_STR)?;
    }
    for subtag in self.iter() {
        f(subtag.as_str())?;
    }
    Ok(())
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Private {
    type Err = ParseError;

    /// Parses a private-use extension, `x` singleton included, from its string form.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_utf8(s.as_bytes())
    }
}
writeable::impl_display_with_writeable!(Private, #[cfg(feature = "alloc")]);
impl writeable::Writeable for Private {
    /// Writes the `x` singleton followed by each subtag, `-`-separated.
    /// An empty list writes nothing.
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        if self.is_empty() {
            return Ok(());
        }
        sink.write_char(PRIVATE_EXT_CHAR)?;
        self.iter().try_for_each(|key| {
            sink.write_char('-')?;
            writeable::Writeable::write_to(key, &mut *sink)
        })
    }

    /// Sums one byte for the singleton (when any subtag exists) plus, for
    /// each subtag, its own hint and one byte for the separator.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let singleton_len = if self.is_empty() { 0 } else { 1 };
        let mut hint = writeable::LengthHint::exact(singleton_len);
        for key in self.iter() {
            hint += writeable::Writeable::writeable_length_hint(key) + 1;
        }
        hint
    }
}
impl Deref for Private {
    type Target = [Subtag];

    /// Exposes the stored subtags as a slice.
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_private_extension_fromstr() {
        let parsed = "x-foo-bar-l-baz"
            .parse::<Private>()
            .expect("Failed to parse Private");
        assert_eq!(parsed.to_string(), "x-foo-bar-l-baz");

        // The singleton alone, with no subtags after it, must be rejected.
        assert!("x".parse::<Private>().is_err());
    }
}

View File

@@ -0,0 +1,47 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A single item used in a list of [`Private`](super::Private) extensions.
///
/// The subtag has to be an ASCII alphanumeric string no shorter than
/// one character and no longer than eight.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::Subtag;
///
/// let subtag1: Subtag = "Foo".parse()
/// .expect("Failed to parse a Subtag.");
///
/// assert_eq!(subtag1.as_str(), "foo");
/// ```
///
/// Note: this is different from the generic [`Subtag`](crate::subtags::Subtag),
/// which is between two and eight characters long.
///
/// ```
/// use icu::locale::extensions::private;
/// use icu::locale::subtags;
///
/// let subtag: Result<private::Subtag, _> = "f".parse();
/// assert!(subtag.is_ok());
///
/// let subtag: Result<subtags::Subtag, _> = "f".parse();
/// assert!(subtag.is_err());
/// ```
Subtag,
extensions::private,
subtag,
extensions_private_subtag,
1..=8,
s,
s.is_ascii_alphanumeric(),
s.to_ascii_lowercase(),
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
InvalidExtension,
["foo12"],
["toolooong"],
);

View File

@@ -0,0 +1,234 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::borrow::Borrow;
use litemap::LiteMap;
use super::Key;
use super::Value;
/// A list of [`Key`]-[`Value`] pairs representing functional information
/// about content transformations.
///
/// Here are examples of fields used in Unicode:
/// - `s0`, `d0` - Transform source/destination
/// - `t0` - Machine Translation
/// - `h0` - Hybrid Locale Identifiers
///
/// You can find the full list in [`Unicode BCP 47 T Extension`] section of LDML.
///
/// [`Unicode BCP 47 T Extension`]: https://unicode.org/reports/tr35/tr35.html#BCP47_T_Extension
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Fields, Value};
///
/// let value = "hybrid".parse::<Value>().expect("Failed to parse a Value.");
/// let fields = [(key!("h0"), value)].into_iter().collect::<Fields>();
///
/// assert_eq!(&fields.to_string(), "h0-hybrid");
/// ```
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct Fields(Inner);
// With `alloc`, the map uses `LiteMap`'s default backing store.
#[cfg(feature = "alloc")]
type Inner = LiteMap<Key, Value>;
// Without `alloc`, the map is backed by a `'static` slice of key-value pairs.
#[cfg(not(feature = "alloc"))]
type Inner = LiteMap<Key, Value, &'static [(Key, Value)]>;
impl Fields {
/// Returns a new empty list of key-value pairs. Same as [`default()`](Default::default()), but is `const`.
///
/// The result wraps an empty `LiteMap`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Fields;
///
/// assert_eq!(Fields::new(), Fields::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(LiteMap::new())
}
/// Returns `true` if there are no fields.
///
/// In the example, only the first locale carries a transform field (`h0`).
///
/// # Examples
///
/// ```
/// use icu::locale::locale;
/// use icu::locale::Locale;
///
/// let loc1 = Locale::try_from_str("und-t-h0-hybrid").unwrap();
/// let loc2 = locale!("und-u-ca-buddhist");
///
/// assert!(!loc1.extensions.transform.fields.is_empty());
/// assert!(loc2.extensions.transform.fields.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Empties the [`Fields`] list.
///
/// Replaces `self` with the default (empty) value and returns the previous contents.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Fields, Value};
///
/// let value = "hybrid".parse::<Value>().expect("Failed to parse a Value.");
/// let mut fields = [(key!("h0"), value)].into_iter().collect::<Fields>();
///
/// assert_eq!(&fields.to_string(), "h0-hybrid");
///
/// fields.clear();
///
/// assert_eq!(fields, Fields::new());
/// ```
pub fn clear(&mut self) -> Self {
core::mem::take(self)
}
/// Returns `true` if the list contains a [`Value`] for the specified [`Key`].
///
/// The key may be given as any borrowed form `Q` of [`Key`] that implements `Ord`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{Fields, Key, Value};
///
/// let key: Key = "h0".parse().expect("Failed to parse a Key.");
/// let value: Value = "hybrid".parse().expect("Failed to parse a Value.");
/// let fields = [(key, value)].into_iter().collect::<Fields>();
///
/// let key: Key = "h0".parse().expect("Failed to parse a Key.");
/// assert!(fields.contains_key(&key));
/// ```
pub fn contains_key<Q>(&self, key: &Q) -> bool
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.contains_key(key)
}
/// Returns a reference to the [`Value`] corresponding to the [`Key`].
///
/// The key may be given as any borrowed form `Q` of [`Key`] that implements `Ord`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Fields, Value};
///
/// let value = "hybrid".parse::<Value>().unwrap();
/// let fields = [(key!("h0"), value.clone())]
/// .into_iter()
/// .collect::<Fields>();
///
/// assert_eq!(fields.get(&key!("h0")), Some(&value));
/// ```
pub fn get<Q>(&self, key: &Q) -> Option<&Value>
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.get(key)
}
/// Stores `value` under `key`, returning the previous value if the key was
/// already present.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Value};
/// use icu::locale::Locale;
///
/// let lower = "lower".parse::<Value>().expect("valid extension subtag");
/// let casefold = "casefold".parse::<Value>().expect("valid extension subtag");
///
/// let mut loc: Locale = "en-t-hi-d0-casefold"
///     .parse()
///     .expect("valid BCP-47 identifier");
/// let old_value = loc.extensions.transform.fields.set(key!("d0"), lower);
///
/// assert_eq!(old_value, Some(casefold));
/// assert_eq!(loc, "en-t-hi-d0-lower".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn set(&mut self, key: Key, value: Value) -> Option<Value> {
    let Self(map) = self;
    map.insert(key, value)
}
/// Keeps only the fields whose key satisfies `predicate`; all other fields
/// are removed.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::key;
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-t-h0-hybrid-d0-hex-m0-xml".parse().unwrap();
///
/// loc.extensions
///     .transform
///     .fields
///     .retain_by_key(|&k| k == key!("h0"));
/// assert_eq!(loc, "und-t-h0-hybrid".parse().unwrap());
///
/// loc.extensions
///     .transform
///     .fields
///     .retain_by_key(|&k| k == key!("d0"));
/// assert_eq!(loc, Locale::UNKNOWN);
/// ```
#[cfg(feature = "alloc")]
pub fn retain_by_key<F>(&mut self, mut predicate: F)
where
    F: FnMut(&Key) -> bool,
{
    // Only the key takes part in the decision; the value is ignored.
    self.0.retain(|candidate, _value| predicate(candidate));
}
/// Feeds every subtag of every field to `f`, in iteration order of the map:
/// first the key, then the subtags of its value.
///
/// Stops at, and returns, the first error produced by `f`.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    self.0
        .iter()
        .try_for_each(|(key, value)| -> Result<(), E> {
            f(key.as_str())?;
            value.for_each_subtag_str(f)
        })
}
/// Test-only constructor that collects a vector of key-value pairs.
///
/// This needs to be its own method to help with type inference in helpers.rs
#[cfg(test)]
pub(crate) fn from_tuple_vec(v: Vec<(Key, Value)>) -> Self {
    <Self as core::iter::FromIterator<(Key, Value)>>::from_iter(v)
}
}
/// Wraps an existing key-value map as a [`Fields`] list without copying it.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl From<LiteMap<Key, Value>> for Fields {
    fn from(inner: LiteMap<Key, Value>) -> Self {
        Fields(inner)
    }
}
/// Builds a [`Fields`] list by collecting key-value pairs into the backing map.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl core::iter::FromIterator<(Key, Value)> for Fields {
    fn from_iter<I: IntoIterator<Item = (Key, Value)>>(iter: I) -> Self {
        Self(iter.into_iter().collect())
    }
}
// Generates the string-writing support for `Fields` through a crate-local helper macro.
// NOTE(review): the string arguments look like sample key/value data for tests the
// macro generates — confirm against the macro definition.
impl_writeable_for_key_value!(Fields, "h0", "hybrid", "m0", "m0-true");

View File

@@ -0,0 +1,32 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A key used in a list of [`Fields`](super::Fields).
///
/// The key has to be exactly two ASCII characters long, with the first
/// character being alphabetic and the second being a digit.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Key;
///
/// let key1: Key = "k0".parse().expect("Failed to parse a Key.");
///
/// assert_eq!(key1.as_str(), "k0");
/// ```
Key,
extensions::transform,
key,
extensions_transform_key,
2..=2,
s,
s.all_bytes()[0].is_ascii_alphabetic() && s.all_bytes()[1].is_ascii_digit(),
s.to_ascii_lowercase(),
s.all_bytes()[0].is_ascii_lowercase() && s.all_bytes()[1].is_ascii_digit(),
InvalidExtension,
["k0"],
["", "k", "0k", "k12"],
);

View File

@@ -0,0 +1,336 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Transform Extensions provide information on content transformations in a given locale.
//!
//! The main struct for this extension is [`Transform`] which contains [`Fields`] and an
//! optional [`LanguageIdentifier`].
//!
//! [`LanguageIdentifier`]: super::super::LanguageIdentifier
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::transform::{Fields, Key, Transform, Value};
//! use icu::locale::{LanguageIdentifier, Locale};
//!
//! let mut loc: Locale =
//! "en-US-t-es-ar-h0-hybrid".parse().expect("Parsing failed.");
//!
//! let lang: LanguageIdentifier =
//! "es-AR".parse().expect("Parsing LanguageIdentifier failed.");
//!
//! let key: Key = "h0".parse().expect("Parsing key failed.");
//! let value: Value = "hybrid".parse().expect("Parsing value failed.");
//!
//! assert_eq!(loc.extensions.transform.lang, Some(lang));
//! assert!(loc.extensions.transform.fields.contains_key(&key));
//! assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value));
//!
//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-ar-h0-hybrid");
//! ```
mod fields;
mod key;
mod value;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
pub use fields::Fields;
#[doc(inline)]
pub use key::{key, Key};
pub use value::Value;
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
#[cfg(feature = "alloc")]
use crate::parser::{parse_language_identifier_from_iter, ParseError, ParserMode};
#[cfg(feature = "alloc")]
use crate::shortvec::ShortBoxSlice;
use crate::subtags;
#[cfg(feature = "alloc")]
use crate::subtags::Language;
use crate::LanguageIdentifier;
#[cfg(feature = "alloc")]
use litemap::LiteMap;
/// Singleton character written first when a non-empty [`Transform`] is serialized.
pub(crate) const TRANSFORM_EXT_CHAR: char = 't';
/// String form of the transform singleton, emitted as the leading subtag when
/// the extension is enumerated subtag by subtag.
pub(crate) const TRANSFORM_EXT_STR: &str = "t";
/// A list of [`Unicode BCP47 T Extensions`] as defined in the
/// [`Unicode Locale Identifier`] specification.
///
/// The transform extension carries information about the source language or script of
/// transformed content, including content that has been transliterated, transcribed,
/// or translated, or in some other way influenced by the source (see [`RFC 6497`] for details).
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{Key, Value};
/// use icu::locale::{LanguageIdentifier, Locale};
///
/// let mut loc: Locale =
///     "de-t-en-us-h0-hybrid".parse().expect("Parsing failed.");
///
/// let en_us: LanguageIdentifier = "en-US".parse().expect("Parsing failed.");
///
/// assert_eq!(loc.extensions.transform.lang, Some(en_us));
/// let key: Key = "h0".parse().expect("Parsing key failed.");
/// let value: Value = "hybrid".parse().expect("Parsing value failed.");
/// assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value));
/// ```
/// [`Unicode BCP47 T Extensions`]: https://unicode.org/reports/tr35/#t_Extension
/// [`RFC 6497`]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash)]
#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure
pub struct Transform {
/// The [`LanguageIdentifier`] specified with this locale extension, or `None` if not present.
pub lang: Option<LanguageIdentifier>,
/// The key-value pairs present in this locale extension, with each extension key subtag
/// associated with its provided value subtag(s).
pub fields: Fields,
}
impl Transform {
/// Returns a new empty map of Transform extensions. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Transform;
///
/// assert_eq!(Transform::new(), Transform::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self {
lang: None,
fields: Fields::new(),
}
}
/// Parses a string slice into a well-formed [`Transform`].
///
/// This is a thin wrapper that forwards the bytes of `s` to
/// [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// The first subtag must identify the transform extension; everything after
/// it is parsed as the extension's content. Missing input or a different
/// extension type yields [`ParseError::InvalidExtension`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let mut subtags = SubtagIterator::new(code_units);
    let Some(singleton) = subtags.next() else {
        return Err(ParseError::InvalidExtension);
    };
    match ExtensionType::try_from_byte_slice(singleton)? {
        ExtensionType::Transform => Self::try_from_iter(&mut subtags),
        _ => Err(ParseError::InvalidExtension),
    }
}
/// Returns `true` if this [`Transform`] has neither a tlang (`lang`) nor any
/// tfields (`fields`).
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "en-US-t-es-ar".parse().expect("Parsing failed.");
///
/// assert!(!loc.extensions.transform.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
    if self.lang.is_some() {
        return false;
    }
    self.fields.is_empty()
}
/// Clears the transform extension, effectively removing it from the locale.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "en-US-t-es-ar".parse().unwrap();
/// loc.extensions.transform.clear();
/// assert_eq!(loc, "en-US".parse().unwrap());
/// ```
pub fn clear(&mut self) {
self.lang = None;
self.fields.clear();
}
/// Borrows this extension as a tuple of its language parts (when a language
/// is present) and its fields; used as the comparison key by `total_cmp`.
#[expect(clippy::type_complexity)]
pub(crate) fn as_tuple(
    &self,
) -> (
    Option<(
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        &subtags::Variants,
    )>,
    &Fields,
) {
    let lang_parts = self.lang.as_ref().map(|lang| lang.as_tuple());
    (lang_parts, &self.fields)
}
/// Compares two [`Transform`] values with a total order that is suitable for
/// ordered collections such as [`BTreeSet`].
///
/// The ordering may or may not be equivalent to string ordering, and it
/// may or may not be stable across ICU4X releases.
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    Ord::cmp(&lhs, &rhs)
}
/// Parses the content of a transform extension from `iter`.
///
/// An optional language identifier (tlang) is read first, followed by zero or
/// more key/value fields (tfields). Parsing stops, without consuming it, at the
/// first subtag that is neither a value subtag for the current key nor a valid key.
///
/// # Errors
///
/// Returns [`ParseError::InvalidExtension`] when a key is not followed by at
/// least one value subtag, or when neither a language nor any field was found.
/// Errors from parsing the language identifier are propagated unchanged.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
let mut tlang = None;
let mut tfields = LiteMap::new();
// A tlang is only attempted when the next subtag parses as a language.
if let Some(subtag) = iter.peek() {
if Language::try_from_utf8(subtag).is_ok() {
tlang = Some(parse_language_identifier_from_iter(
iter,
ParserMode::Partial,
)?);
}
}
// State of the field currently being assembled.
let mut current_tkey = None;
let mut current_tvalue = ShortBoxSlice::new();
// Tracked separately from `current_tvalue` because `Value::parse_subtag` can
// succeed with `None`, which counts as a value but pushes nothing.
let mut has_current_tvalue = false;
while let Some(subtag) = iter.peek() {
if let Some(tkey) = current_tkey {
if let Ok(val) = Value::parse_subtag(subtag) {
has_current_tvalue = true;
if let Some(val) = val {
current_tvalue.push(val);
}
} else {
// Not a value subtag: the current field ends here and must have a value.
if !has_current_tvalue {
return Err(ParseError::InvalidExtension);
}
// NOTE(review): the result of `try_insert` is ignored; presumably an
// already-present key keeps its first value — confirm against LiteMap.
tfields.try_insert(tkey, Value::from_short_slice_unchecked(current_tvalue));
current_tkey = None;
current_tvalue = ShortBoxSlice::new();
has_current_tvalue = false;
// Re-examine the same subtag (it was only peeked) as a possible key.
continue;
}
} else if let Ok(tkey) = Key::try_from_utf8(subtag) {
current_tkey = Some(tkey);
} else {
break;
}
// The peeked subtag was accepted; consume it.
iter.next();
}
// Flush the field that was still open when the input ended or parsing stopped.
if let Some(tkey) = current_tkey {
if !has_current_tvalue {
return Err(ParseError::InvalidExtension);
}
tfields.try_insert(tkey, Value::from_short_slice_unchecked(current_tvalue));
}
if tlang.is_none() && tfields.is_empty() {
Err(ParseError::InvalidExtension)
} else {
Ok(Self {
lang: tlang,
fields: tfields.into(),
})
}
}
/// Feeds the subtags of this extension to `f`, in written order.
///
/// Nothing is emitted for an empty extension. Otherwise the `t` singleton is
/// emitted first when `with_ext` is set, followed by the lowercased language
/// subtags (if a language is present) and then the fields. The first error
/// returned by `f` is propagated.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    if !self.is_empty() {
        if with_ext {
            f(TRANSFORM_EXT_STR)?;
        }
        if let Some(tlang) = self.lang.as_ref() {
            tlang.for_each_subtag_str_lowercased(f)?;
        }
        self.fields.for_each_subtag_str(f)?;
    }
    Ok(())
}
}
/// Allows `"t-…".parse::<Transform>()`; the parsing rules are the same as for
/// [`Transform::try_from_str`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Transform {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Transform::try_from_utf8(s.as_bytes())
    }
}
// Derives the `Display` implementation from the `Writeable` implementation below.
// NOTE(review): the `#[cfg(feature = "alloc")]` argument presumably gates the
// allocating parts of what the macro generates — confirm against the `writeable` docs.
writeable::impl_display_with_writeable!(Transform, #[cfg(feature = "alloc")]);
impl writeable::Writeable for Transform {
    /// Writes the extension as `t[-<lang>][-<fields>]`.
    ///
    /// An empty extension writes nothing at all; the language is written in
    /// lowercase.
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        if !self.is_empty() {
            sink.write_char(TRANSFORM_EXT_CHAR)?;
            if let Some(tlang) = self.lang.as_ref() {
                sink.write_char('-')?;
                tlang.write_lowercased_to(sink)?;
            }
            let tfields = &self.fields;
            if !tfields.is_empty() {
                sink.write_char('-')?;
                writeable::Writeable::write_to(tfields, sink)?;
            }
        }
        Ok(())
    }

    /// Estimates the written length: zero for an empty extension, otherwise
    /// one for the singleton plus, for each part present, that part's own
    /// hint and one for its `-` separator.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        if self.is_empty() {
            return writeable::LengthHint::exact(0);
        }
        // Accounts for the singleton character.
        let mut total = writeable::LengthHint::exact(1);
        if let Some(tlang) = self.lang.as_ref() {
            total += writeable::Writeable::writeable_length_hint(tlang) + 1;
        }
        let tfields = &self.fields;
        if !tfields.is_empty() {
            total += writeable::Writeable::writeable_length_hint(tfields) + 1;
        }
        total
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A full transform extension round-trips through parsing and
    /// formatting, while a bare `t` singleton is rejected.
    #[test]
    fn test_transform_extension_fromstr() {
        let parsed = "t-en-us-h0-hybrid"
            .parse::<Transform>()
            .expect("Failed to parse Transform");
        assert_eq!(parsed.to_string(), "t-en-us-h0-hybrid");

        assert!("t".parse::<Transform>().is_err());
    }
}

View File

@@ -0,0 +1,165 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
use crate::subtags::{subtag, Subtag};
use core::ops::RangeInclusive;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A value used in a list of [`Fields`](super::Fields).
///
/// The value has to be a sequence of one or more alphanumerical strings
/// separated by `-`.
/// Each part of the sequence has to be no shorter than three characters and no
/// longer than eight.
///
/// The subtag `true` is not stored: parsing skips it, and a value with no
/// stored subtags is emitted as `true`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Value;
///
/// "hybrid".parse::<Value>().expect("Valid Value.");
///
/// "hybrid-foobar".parse::<Value>().expect("Valid Value.");
///
/// "no".parse::<Value>().expect_err("Invalid Value.");
/// ```
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Default)]
pub struct Value(ShortBoxSlice<Subtag>);
/// Allowed byte length of a single value subtag.
#[allow(dead_code)]
const TYPE_LENGTH: RangeInclusive<usize> = 3..=8;
/// The `true` subtag, which is dropped when parsing and emitted again for an empty value.
const TRUE_TVALUE: Subtag = subtag!("true");
impl Value {
/// Parses a string slice into a well-formed [`Value`].
///
/// This is a thin wrapper that forwards the bytes of `s` to
/// [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Value;
///
/// let value = Value::try_from_str("hybrid").expect("Parsing failed.");
/// ```
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// Every subtag must be 3 to 8 ASCII alphanumeric bytes, and at least one
/// subtag must be present; otherwise [`ParseError::InvalidExtension`] is
/// returned. Subtags equal to `true` are accepted but not stored.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let mut stored = ShortBoxSlice::default();
    let mut saw_subtag = false;
    for raw in SubtagIterator::new(code_units) {
        saw_subtag = true;
        let parsed = if Self::is_type_subtag(raw) {
            Subtag::try_from_utf8(raw).ok()
        } else {
            None
        };
        match parsed {
            // `true` counts as a value but is never stored.
            Some(subtag) if subtag == TRUE_TVALUE => {}
            Some(subtag) => stored.push(subtag),
            None => return Err(ParseError::InvalidExtension),
        }
    }
    if saw_subtag {
        Ok(Self(stored))
    } else {
        Err(ParseError::InvalidExtension)
    }
}
#[allow(dead_code)]
pub(crate) fn from_short_slice_unchecked(input: ShortBoxSlice<Subtag>) -> Self {
Self(input)
}
/// Checks that `t` has an allowed length and consists only of ASCII
/// alphanumeric bytes.
#[allow(dead_code)]
pub(crate) fn is_type_subtag(t: &[u8]) -> bool {
    if !TYPE_LENGTH.contains(&t.len()) {
        return false;
    }
    t.iter().all(|byte| byte.is_ascii_alphanumeric())
}
/// Parses one value subtag.
///
/// Returns `Ok(None)` for the `true` subtag and `Ok(Some(_))` with the
/// lowercased subtag otherwise. A length outside the allowed range yields
/// [`ParseError::InvalidExtension`]; a subtag that fails to parse yields
/// [`ParseError::InvalidSubtag`].
#[allow(dead_code)]
pub(crate) fn parse_subtag(t: &[u8]) -> Result<Option<Subtag>, ParseError> {
    if !TYPE_LENGTH.contains(&t.len()) {
        return Err(ParseError::InvalidExtension);
    }
    let normalized = Subtag::try_from_utf8(t)
        .map_err(|_| ParseError::InvalidSubtag)?
        .to_ascii_lowercase();
    Ok((normalized != TRUE_TVALUE).then_some(normalized))
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
if self.0.is_empty() {
f(TRUE_TVALUE.as_str())?;
} else {
self.0.iter().map(Subtag::as_str).try_for_each(f)?;
}
Ok(())
}
}
/// Allows `"hybrid".parse::<Value>()`; the parsing rules are the same as for
/// [`Value::try_from_str`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Value {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Value::try_from_utf8(s.as_bytes())
    }
}
// Generates the string-writing support for `Value` through a crate-local helper macro;
// the trailing arm special-cases a value with no stored subtags to the literal "true".
// NOTE(review): the `_no_test` suffix presumably means the macro generates no test,
// which would explain the hand-written `test_writeable` — confirm against the macro.
impl_writeable_for_each_subtag_str_no_test!(Value, selff, selff.0.is_empty() => Some("true"));
/// Checks the written form of an empty, a one-subtag and a two-subtag value.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    let hybrid = subtag!("hybrid");
    let foobar = subtag!("foobar");

    // An empty value is written as `true`.
    assert_writeable_eq!(Value::default(), "true");

    let single = Value::from_short_slice_unchecked(vec![hybrid].into());
    assert_writeable_eq!(single, "hybrid");

    let double = Value::from_short_slice_unchecked(vec![hybrid, foobar].into());
    assert_writeable_eq!(double, "hybrid-foobar");
}
/// Subtags of length 3 and 8 are accepted; a 2-byte subtag is rejected.
#[test]
fn test_short_tvalue() {
    let parsed = Value::try_from_str("foo-longstag");
    assert!(parsed.is_ok());
    let parsed = parsed.unwrap();

    let expected = [subtag!("foo"), subtag!("longstag")];
    assert_eq!(parsed.0.len(), 2);
    assert!(parsed.0.iter().eq(expected.iter()));

    assert!(Value::try_from_str("foo-ba").is_err());
}

View File

@@ -0,0 +1,34 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// An attribute used in a set of [`Attributes`](super::Attributes).
///
/// An attribute has to be a sequence of ASCII alphanumeric characters, no
/// shorter than three and no longer than eight characters.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{attribute, Attribute};
///
/// let attr: Attribute =
///     "buddhist".parse().expect("Failed to parse an Attribute.");
///
/// assert_eq!(attr, attribute!("buddhist"));
/// ```
Attribute,
extensions::unicode,
attribute,
extensions_unicode_attribute,
3..=8,
s,
s.is_ascii_alphanumeric(),
s.to_ascii_lowercase(),
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
InvalidExtension,
["foo12"],
["no", "toolooong"],
);

View File

@@ -0,0 +1,206 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::Attribute;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
#[cfg(feature = "alloc")]
use crate::ParseError;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A set of [`Attribute`] elements as defined in [`Unicode Extension Attributes`].
///
/// Parsing keeps the elements sorted and free of duplicates;
/// [`Attributes::from_vec_unchecked`] stores the caller's list as given.
///
/// [`Unicode Extension Attributes`]: https://unicode.org/reports/tr35/tr35.html#u_Extension
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{Attribute, Attributes};
///
/// let attribute1: Attribute =
///     "foobar".parse().expect("Failed to parse an attribute subtag.");
///
/// let attribute2: Attribute = "testing"
///     .parse()
///     .expect("Failed to parse an attribute subtag.");
/// let mut v = vec![attribute1, attribute2];
/// v.sort();
/// v.dedup();
///
/// let attributes: Attributes = Attributes::from_vec_unchecked(v);
/// assert_eq!(attributes.to_string(), "foobar-testing");
/// ```
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct Attributes(ShortBoxSlice<Attribute>);
impl Attributes {
/// Returns a new empty set of attributes. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Attributes;
///
/// assert_eq!(Attributes::new(), Attributes::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(ShortBoxSlice::new())
}
/// Parses a string slice into a well-formed [`Attributes`] set.
///
/// This is a thin wrapper that forwards the bytes of `s` to
/// [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// The input is split into subtags, which are then parsed as attributes.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    Self::try_from_iter(&mut SubtagIterator::new(code_units))
}
/// Builds an [`Attributes`] set from a list of [`Attribute`] elements that
/// the caller has already sorted.
///
/// The list is stored as given; no sorting or validation happens here.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{Attribute, Attributes};
///
/// let attribute1: Attribute = "foobar".parse().expect("Parsing failed.");
/// let attribute2: Attribute = "testing".parse().expect("Parsing failed.");
/// let mut v = vec![attribute1, attribute2];
/// v.sort();
/// v.dedup();
///
/// let attributes = Attributes::from_vec_unchecked(v);
/// ```
///
/// Notice: For performance- and memory-constrained environments, it is recommended
/// for the caller to use [`binary_search`](slice::binary_search) instead of [`sort`](slice::sort)
/// and [`dedup`](Vec::dedup()).
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(input: Vec<Attribute>) -> Self {
    let storage: ShortBoxSlice<Attribute> = input.into();
    Self(storage)
}
/// Empties the [`Attributes`] list.
///
/// Returns the old list.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{attribute, Attributes};
/// use writeable::assert_writeable_eq;
///
/// let mut attributes = Attributes::from_vec_unchecked(vec![
/// attribute!("foobar"),
/// attribute!("testing"),
/// ]);
///
/// assert_writeable_eq!(attributes, "foobar-testing");
///
/// attributes.clear();
///
/// assert_writeable_eq!(attributes, "");
/// ```
pub fn clear(&mut self) -> Self {
core::mem::take(self)
}
/// Consumes leading subtags from `iter` for as long as they parse as
/// attributes, keeping the collected attributes sorted and unique.
///
/// The first subtag that is not an attribute is left unconsumed. This
/// function has no failing path and always returns `Ok`.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    let mut sorted = ShortBoxSlice::new();
    while let Some(candidate) = iter.peek() {
        let Ok(attr) = Attribute::try_from_utf8(candidate) else {
            break;
        };
        // `Err` carries the insertion point that keeps the list sorted;
        // `Ok` means the attribute is already present and is skipped.
        if let Err(slot) = sorted.binary_search(&attr) {
            sorted.insert(slot, attr);
        }
        iter.next();
    }
    Ok(Self(sorted))
}
/// Feeds each attribute to `f` as a string, in stored order, stopping at the
/// first error returned by `f`.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    for attr in self.0.iter() {
        f(attr.as_str())?;
    }
    Ok(())
}
/// Merges the attributes of `other` into this set.
///
/// Each incoming attribute is inserted at its sorted position; attributes
/// that are already present are skipped.
///
/// # Example
///
/// ```
/// use icu::locale::extensions::unicode::Attributes;
///
/// let mut attrs: Attributes = "foobar-foobaz".parse().unwrap();
/// let attrs2: Attributes = "foobar-fooqux".parse().unwrap();
///
/// attrs.extend_from_attributes(attrs2);
///
/// assert_eq!(attrs, "foobar-foobaz-fooqux".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn extend_from_attributes(&mut self, other: Attributes) {
    let Attributes(incoming) = other;
    for attr in incoming {
        match self.0.binary_search(&attr) {
            Ok(_) => {}
            Err(slot) => {
                self.0.insert(slot, attr);
            }
        }
    }
}
}
/// Allows `"foo-bar".parse::<Attributes>()`; the parsing rules are the same
/// as for [`Attributes::try_from_str`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Attributes {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Attributes::try_from_utf8(s.as_bytes())
    }
}
// Generates the string-writing support for `Attributes` through a crate-local helper macro.
// NOTE(review): the string arguments look like sample subtags for tests the macro
// generates — confirm against the macro definition.
impl_writeable_for_subtag_list!(Attributes, "foobar", "testing");
/// Exposes the stored attributes as a read-only slice.
impl Deref for Attributes {
    type Target = [Attribute];

    fn deref(&self) -> &[Attribute] {
        // Deref coercion turns the inner container into a slice.
        &self.0
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parsing sorts the attributes, so `foo-bar` is written back as `bar-foo`.
    #[test]
    fn test_attributes_fromstr() {
        let parsed = "foo-bar"
            .parse::<Attributes>()
            .expect("Failed to parse Attributes");
        assert_eq!(parsed.to_string(), "bar-foo");
    }
}

View File

@@ -0,0 +1,32 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A key used in a list of [`Keywords`](super::Keywords).
///
/// The key has to be exactly two ASCII characters long, with the first
/// character being alphanumeric and the second being alphabetic.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Key;
///
/// assert!("ca".parse::<Key>().is_ok());
/// ```
Key,
extensions::unicode,
key,
extensions_unicode_key,
2..=2,
s,
s.all_bytes()[0].is_ascii_alphanumeric() && s.all_bytes()[1].is_ascii_alphabetic(),
s.to_ascii_lowercase(),
(s.all_bytes()[0].is_ascii_lowercase() || s.all_bytes()[0].is_ascii_digit())
&& s.all_bytes()[1].is_ascii_lowercase(),
InvalidExtension,
["ca", "8a"],
["a", "a8", "abc"],
);

View File

@@ -0,0 +1,453 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::borrow::Borrow;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::iter::FromIterator;
#[cfg(feature = "alloc")]
use core::str::FromStr;
use litemap::LiteMap;
use super::Key;
use super::Value;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
/// A list of [`Key`]-[`Value`] pairs representing functional information
/// about a locale's internationalization preferences.
///
/// Here are examples of fields used in Unicode:
/// - `hc` - Hour Cycle (`h11`, `h12`, `h23`, `h24`)
/// - `ca` - Calendar (`buddhist`, `gregory`, ...)
/// - `fw` - First Day Of the Week (`sun`, `mon`, `sat`, ...)
///
/// You can find the full list in the [`Unicode BCP 47 U Extension`] section of LDML.
///
/// [`Unicode BCP 47 U Extension`]: https://unicode.org/reports/tr35/tr35.html#Key_And_Type_Definitions_
///
/// # Examples
///
/// Manually build up a [`Keywords`] object:
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let keywords = [(key!("hc"), value!("h23"))]
///     .into_iter()
///     .collect::<Keywords>();
///
/// assert_eq!(&keywords.to_string(), "hc-h23");
/// ```
///
/// Access a [`Keywords`] object from a [`Locale`]:
///
/// ```
/// use icu::locale::{
///     extensions::unicode::{key, value},
///     Locale,
/// };
///
/// let loc: Locale = "und-u-hc-h23-kc-true".parse().expect("Valid BCP-47");
///
/// assert_eq!(loc.extensions.unicode.keywords.get(&key!("ca")), None);
/// assert_eq!(
///     loc.extensions.unicode.keywords.get(&key!("hc")),
///     Some(&value!("h23"))
/// );
/// assert_eq!(
///     loc.extensions.unicode.keywords.get(&key!("kc")),
///     Some(&value!("true"))
/// );
///
/// assert_eq!(loc.extensions.unicode.keywords.to_string(), "hc-h23-kc");
/// ```
///
/// [`Locale`]: crate::Locale
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct Keywords(LiteMap<Key, Value, ShortBoxSlice<(Key, Value)>>);
impl Keywords {
/// Returns a new empty list of key-value pairs. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Keywords;
///
/// assert_eq!(Keywords::new(), Keywords::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(LiteMap::new())
}
/// Create a new list of key-value pairs having exactly one pair, callable in a `const` context.
#[inline]
pub const fn new_single(key: Key, value: Value) -> Self {
Self(LiteMap::from_sorted_store_unchecked(
ShortBoxSlice::new_single((key, value)),
))
}
/// Parses a string slice into a well-formed [`Keywords`] list.
///
/// This is a thin wrapper that forwards the bytes of `s` to
/// [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// The input is split into subtags, which are then parsed as keywords.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    Self::try_from_iter(&mut SubtagIterator::new(code_units))
}
/// Returns `true` if there are no keywords.
///
/// # Examples
///
/// ```
/// use icu::locale::locale;
/// use icu::locale::Locale;
///
/// let loc1 = Locale::try_from_str("und-t-h0-hybrid").unwrap();
/// let loc2 = locale!("und-u-ca-buddhist");
///
/// assert!(loc1.extensions.unicode.keywords.is_empty());
/// assert!(!loc2.extensions.unicode.keywords.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Returns `true` if the list contains a [`Value`] for the specified [`Key`].
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let keywords = [(key!("ca"), value!("gregory"))]
/// .into_iter()
/// .collect::<Keywords>();
///
/// assert!(&keywords.contains_key(&key!("ca")));
/// ```
pub fn contains_key<Q>(&self, key: &Q) -> bool
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.contains_key(key)
}
/// Returns a reference to the [`Value`] corresponding to the [`Key`].
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let keywords = [(key!("ca"), value!("buddhist"))]
/// .into_iter()
/// .collect::<Keywords>();
///
/// assert_eq!(keywords.get(&key!("ca")), Some(&value!("buddhist")));
/// ```
pub fn get<Q>(&self, key: &Q) -> Option<&Value>
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.get(key)
}
/// Looks up the [`Value`] stored for the given [`Key`] and returns it as a
/// mutable reference.
///
/// Returns `None` if the key is not present in the list.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let mut keywords = [(key!("ca"), value!("buddhist"))]
///     .into_iter()
///     .collect::<Keywords>();
///
/// if let Some(value) = keywords.get_mut(&key!("ca")) {
///     *value = value!("gregory");
/// }
/// assert_eq!(keywords.get(&key!("ca")), Some(&value!("gregory")));
/// ```
#[cfg(feature = "alloc")]
pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut Value>
where
    Key: Borrow<Q>,
    Q: Ord,
{
    let Self(map) = self;
    map.get_mut(key)
}
/// Stores `value` under `key`, returning the previous value if the keyword
/// was already present.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value};
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12"
///     .parse()
///     .expect("valid BCP-47 identifier");
/// let old_value = loc
///     .extensions
///     .unicode
///     .keywords
///     .set(key!("ca"), value!("japanese"));
///
/// assert_eq!(old_value, Some(value!("buddhist")));
/// assert_eq!(loc, "und-u-hello-ca-japanese-hc-h12".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn set(&mut self, key: Key, value: Value) -> Option<Value> {
    let Self(map) = self;
    map.insert(key, value)
}
/// Deletes the keyword stored under `key`, returning its value if the
/// keyword was present.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::key;
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12"
///     .parse()
///     .expect("valid BCP-47 identifier");
/// loc.extensions.unicode.keywords.remove(key!("ca"));
/// assert_eq!(loc, "und-u-hello-hc-h12".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn remove<Q: Borrow<Key>>(&mut self, key: Q) -> Option<Value> {
    let key_ref: &Key = key.borrow();
    self.0.remove(key_ref)
}
/// Clears all Unicode extension keywords, leaving Unicode attributes.
///
/// Returns the old Unicode extension keywords.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12".parse().unwrap();
/// loc.extensions.unicode.keywords.clear();
/// assert_eq!(loc, "und-u-hello".parse().unwrap());
/// ```
pub fn clear(&mut self) -> Self {
core::mem::take(self)
}
/// Keeps only the keywords whose key satisfies `predicate`; all other
/// keywords are removed.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::key;
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-ca-buddhist-hc-h12-ms-metric".parse().unwrap();
///
/// loc.extensions
///     .unicode
///     .keywords
///     .retain_by_key(|&k| k == key!("hc"));
/// assert_eq!(loc, "und-u-hc-h12".parse().unwrap());
///
/// loc.extensions
///     .unicode
///     .keywords
///     .retain_by_key(|&k| k == key!("ms"));
/// assert_eq!(loc, Locale::UNKNOWN);
/// ```
#[cfg(feature = "alloc")]
pub fn retain_by_key<F>(&mut self, mut predicate: F)
where
    F: FnMut(&Key) -> bool,
{
    // Only the key takes part in the decision; the value is ignored.
    self.0.retain(|candidate, _value| predicate(candidate));
}
/// Compares this [`Keywords`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`Keywords`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
///
/// let bcp47_strings: &[&str] =
///     &["ca-hebrew", "ca-japanese", "ca-japanese-nu-latn", "nu-latn"];
///
/// for ab in bcp47_strings.windows(2) {
///     let a = ab[0];
///     let b = ab[1];
///     assert!(a.cmp(b) == Ordering::Less);
///     let a_kwds = format!("und-u-{}", a)
///         .parse::<Locale>()
///         .unwrap()
///         .extensions
///         .unicode
///         .keywords;
///     assert!(a_kwds.strict_cmp(a.as_bytes()) == Ordering::Equal);
///     assert!(a_kwds.strict_cmp(b.as_bytes()) == Ordering::Less);
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
// Delegates to the `writeable` crate's helper for comparing against UTF-8 bytes.
writeable::cmp_utf8(self, other)
}
/// Parses consecutive keywords (a two-character key followed by zero or more
/// value subtags) from `iter`, stopping at the first subtag that cannot
/// belong to a keyword.
///
/// A subtag is only consumed from `iter` after it has been accepted, so the
/// subtag that ends parsing is left in place for the caller.
///
/// # Errors
///
/// Propagates the error from [`Key::try_from_utf8`] when a two-character
/// subtag is not a valid key.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    let mut map = LiteMap::new();
    // The key currently being parsed and the value subtags collected for it.
    let mut pending_key = None;
    let mut pending_subtags = ShortBoxSlice::new();

    while let Some(subtag) = iter.peek() {
        if subtag.len() == 2 {
            // A two-character subtag opens a new keyword; store the one that
            // was in progress before starting over.
            if let Some(key) = pending_key.take() {
                let finished = core::mem::replace(&mut pending_subtags, ShortBoxSlice::new());
                map.try_insert(key, Value::from_short_slice_unchecked(finished));
            }
            pending_key = Some(Key::try_from_utf8(subtag)?);
        } else if pending_key.is_none() {
            // A value subtag without a preceding key is not part of the keywords.
            break;
        } else {
            match Value::parse_subtag_from_utf8(subtag) {
                Ok(Some(parsed)) => pending_subtags.push(parsed),
                // `None` is the `true` subtag, which is not stored.
                Ok(None) => {}
                Err(_) => break,
            }
        }
        iter.next();
    }

    if let Some(key) = pending_key {
        map.try_insert(key, Value::from_short_slice_unchecked(pending_subtags));
    }
    Ok(map.into())
}
/// Produce an ordered iterator over key-value pairs.
///
/// The pairs are yielded by reference, in the iteration order of the
/// underlying map.
pub fn iter(&self) -> impl Iterator<Item = (&Key, &Value)> {
self.0.iter()
}
/// Calls `f` with every subtag string of these keywords, in order: each key
/// followed by the subtags of its value.
///
/// Iteration stops at the first error returned by `f`, which is passed on to
/// the caller.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    self.0.iter().try_for_each(|(key, value)| {
        f(key.as_str())?;
        value.for_each_subtag_str(&mut *f)
    })
}
/// Merges all keywords of `other` into `self`.
///
/// When a key is present on both sides, the value from `other` replaces the
/// existing one.
///
/// # Example
///
/// ```
/// use icu::locale::extensions::unicode::Keywords;
///
/// let mut kw: Keywords = "ab-cd-ca-buddhist".parse().unwrap();
/// let kw2: Keywords = "ca-gregory-hc-h12".parse().unwrap();
///
/// kw.extend_from_keywords(kw2);
///
/// assert_eq!(kw, "ab-cd-ca-gregory-hc-h12".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn extend_from_keywords(&mut self, other: Keywords) {
    let Keywords(incoming) = other;
    incoming.into_iter().for_each(|(key, value)| {
        // The displaced value, if any, is intentionally dropped.
        self.0.insert(key, value);
    });
}
/// Test-only helper that collects a vector of key/value pairs into [`Keywords`].
///
/// This needs to be its own method to help with type inference in helpers.rs
#[cfg(test)]
pub(crate) fn from_tuple_vec(v: Vec<(Key, Value)>) -> Self {
v.into_iter().collect()
}
}
/// Wraps an already-built key/value map as [`Keywords`]; the map is stored as-is.
impl From<LiteMap<Key, Value, ShortBoxSlice<(Key, Value)>>> for Keywords {
fn from(map: LiteMap<Key, Value, ShortBoxSlice<(Key, Value)>>) -> Self {
Self(map)
}
}
/// Collects key/value pairs into [`Keywords`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromIterator<(Key, Value)> for Keywords {
fn from_iter<I: IntoIterator<Item = (Key, Value)>>(iter: I) -> Self {
// The pairs are first collected into a `LiteMap`, which is then wrapped.
// NOTE(review): how duplicate keys are resolved is decided by
// `LiteMap::from_iter` — confirm against the `litemap` documentation.
LiteMap::from_iter(iter).into()
}
}
/// Parses [`Keywords`] from a string such as `"hc-h12"`, enabling `str::parse`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Keywords {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Thin adapter over the inherent constructor.
Self::try_from_str(s)
}
}
// NOTE(review): this crate-local macro (defined elsewhere) presumably generates the
// `Writeable`/`Display` implementations that `strict_cmp` and `to_string()` rely on;
// the string literals look like sample keys/values for generated tests — confirm.
impl_writeable_for_key_value!(Keywords, "ca", "islamic-civil", "mm", "mm");
#[cfg(test)]
mod tests {
    use super::*;

    /// Parsing keywords from a string and writing them back round-trips.
    #[test]
    fn test_keywords_fromstr() {
        let parsed = "hc-h12"
            .parse::<Keywords>()
            .expect("Failed to parse Keywords");
        assert_eq!(parsed.to_string(), "hc-h12");
    }
}

View File

@@ -0,0 +1,294 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Unicode Extensions provide information about user preferences in a given locale.
//!
//! The main struct for this extension is [`Unicode`] which contains [`Keywords`] and
//! [`Attributes`].
//!
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::unicode::{attribute, key, value, Unicode};
//! use icu::locale::Locale;
//!
//! let loc: Locale = "en-US-u-foobar-hc-h12".parse().expect("Parsing failed.");
//!
//! assert_eq!(
//! loc.extensions.unicode.keywords.get(&key!("hc")),
//! Some(&value!("h12"))
//! );
//! assert!(loc
//! .extensions
//! .unicode
//! .attributes
//! .contains(&attribute!("foobar")));
//! ```
mod attribute;
mod attributes;
mod key;
mod keywords;
mod subdivision;
mod value;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
#[doc(inline)]
pub use attribute::{attribute, Attribute};
pub use attributes::Attributes;
#[doc(inline)]
pub use key::{key, Key};
pub use keywords::Keywords;
#[doc(inline)]
pub use subdivision::{subdivision_suffix, SubdivisionId, SubdivisionSuffix};
#[doc(inline)]
pub use value::{value, Value};
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
/// The singleton that introduces the Unicode extension, as a `char`
/// (written first by the `Writeable` implementation of [`Unicode`]).
pub(crate) const UNICODE_EXT_CHAR: char = 'u';
/// The same singleton as a string slice, for callbacks that take `&str` subtags.
pub(crate) const UNICODE_EXT_STR: &str = "u";
/// Unicode Extensions provide information about user preferences in a given locale.
///
/// A list of [`Unicode BCP47 U Extensions`] as defined in [`Unicode Locale
/// Identifier`] specification.
///
/// Unicode extensions provide subtags that specify language and/or locale-based behavior
/// or refinements to language tags, according to work done by the Unicode Consortium.
/// (See [`RFC 6067`] for details).
///
/// [`Unicode BCP47 U Extensions`]: https://unicode.org/reports/tr35/#u_Extension
/// [`RFC 6067`]: https://www.ietf.org/rfc/rfc6067.txt
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value};
/// use icu::locale::Locale;
///
/// let loc: Locale =
/// "de-u-hc-h12-ca-buddhist".parse().expect("Parsing failed.");
///
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("ca")),
/// Some(&value!("buddhist"))
/// );
/// ```
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash)]
#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure
pub struct Unicode {
/// The key-value pairs present in this locale extension, with each extension key subtag
/// associated to its provided value subtag.
///
/// When the extension is written out, the keywords follow the attributes.
pub keywords: Keywords,
/// A canonically ordered sequence of single standalone subtags for this locale extension.
///
/// When the extension is written out, the attributes come directly after the `u` singleton.
pub attributes: Attributes,
}
impl Unicode {
    /// Creates a [`Unicode`] extension without any keywords or attributes.
    ///
    /// Same as [`default()`](Default::default()), but usable in `const` contexts.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::extensions::unicode::Unicode;
    ///
    /// assert_eq!(Unicode::new(), Unicode::default());
    /// ```
    #[inline]
    pub const fn new() -> Self {
        Self {
            attributes: Attributes::new(),
            keywords: Keywords::new(),
        }
    }

    /// Parses a string slice, including the leading `u` singleton, into a
    /// well-formed [`Unicode`] extension.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
        Self::try_from_utf8(s.as_bytes())
    }

    /// See [`Self::try_from_str`]
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    #[cfg(feature = "alloc")]
    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
        let mut subtags = SubtagIterator::new(code_units);
        // The first subtag has to be the singleton that identifies the extension.
        let Some(ext) = subtags.next() else {
            return Err(ParseError::InvalidExtension);
        };
        match ExtensionType::try_from_byte_slice(ext)? {
            ExtensionType::Unicode => Self::try_from_iter(&mut subtags),
            // A well-formed singleton of some other extension type.
            _ => Err(ParseError::InvalidExtension),
        }
    }

    /// Returns [`true`] if the list of keywords and attributes is empty.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::Locale;
    ///
    /// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
    ///
    /// assert!(!loc.extensions.unicode.is_empty());
    /// ```
    pub fn is_empty(&self) -> bool {
        self.keywords.is_empty() && self.attributes.is_empty()
    }

    /// Removes every keyword and attribute, which effectively removes the
    /// Unicode extension.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::locale::Locale;
    ///
    /// let mut loc: Locale =
    ///     "und-t-mul-u-hello-ca-buddhist-hc-h12".parse().unwrap();
    /// loc.extensions.unicode.clear();
    /// assert_eq!(loc, "und-t-mul".parse().unwrap());
    /// ```
    pub fn clear(&mut self) {
        // Both `clear` calls hand back the old contents; they are dropped here.
        self.keywords.clear();
        self.attributes.clear();
    }

    /// Borrows the fields in comparison order: attributes first, then keywords.
    pub(crate) fn as_tuple(&self) -> (&Attributes, &Keywords) {
        (&self.attributes, &self.keywords)
    }

    /// Returns an ordering suitable for use in [`BTreeSet`].
    ///
    /// The ordering may or may not be equivalent to string ordering, and it
    /// may or may not be stable across ICU4X releases.
    ///
    /// [`BTreeSet`]: alloc::collections::BTreeSet
    pub fn total_cmp(&self, other: &Self) -> Ordering {
        let lhs = self.as_tuple();
        let rhs = other.as_tuple();
        lhs.cmp(&rhs)
    }

    /// Parses the attributes and then the keywords that follow the `u`
    /// singleton, which the caller has already consumed from `iter`.
    ///
    /// # Errors
    ///
    /// Returns [`ParseError::InvalidExtension`] when neither an attribute nor
    /// a keyword was found, and propagates errors from the two sub-parsers.
    #[cfg(feature = "alloc")]
    pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
        let attributes = Attributes::try_from_iter(iter)?;
        let keywords = Keywords::try_from_iter(iter)?;

        // The singleton must be followed by at least one attribute or keyword.
        let has_content = !(attributes.is_empty() && keywords.is_empty());
        if has_content {
            Ok(Self {
                keywords,
                attributes,
            })
        } else {
            Err(ParseError::InvalidExtension)
        }
    }

    /// Calls `f` with every subtag string of this extension: optionally the
    /// `u` singleton (when `with_ext` is set), then the attributes, then the
    /// keywords. Nothing is emitted for an empty extension.
    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
    where
        F: FnMut(&str) -> Result<(), E>,
    {
        if self.is_empty() {
            return Ok(());
        }
        if with_ext {
            f(UNICODE_EXT_STR)?;
        }
        self.attributes.for_each_subtag_str(f)?;
        self.keywords.for_each_subtag_str(f)
    }

    /// Extends the `Unicode` with values from another `Unicode`.
    ///
    /// # Example
    ///
    /// ```
    /// use icu::locale::extensions::unicode::Unicode;
    ///
    /// let mut ue: Unicode = "u-foobar-ca-buddhist".parse().unwrap();
    /// let ue2: Unicode = "u-ca-gregory-hc-h12".parse().unwrap();
    ///
    /// ue.extend(ue2);
    ///
    /// assert_eq!(ue, "u-foobar-ca-gregory-hc-h12".parse().unwrap());
    /// ```
    #[cfg(feature = "alloc")]
    pub fn extend(&mut self, other: Unicode) {
        self.keywords.extend_from_keywords(other.keywords);
        self.attributes.extend_from_attributes(other.attributes);
    }
}
/// Parses a [`Unicode`] extension from a string such as `"u-foo-hc-h12"`,
/// enabling `str::parse`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Unicode {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Thin adapter over the inherent constructor.
Self::try_from_str(s)
}
}
writeable::impl_display_with_writeable!(Unicode, #[cfg(feature = "alloc")]);
// Writes the extension as `u[-<attributes>][-<keywords>]`.
// An empty extension is written as the empty string.
impl writeable::Writeable for Unicode {
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        // An empty extension has no textual form: a bare `u` is rejected by the
        // parser, `for_each_subtag_str` emits nothing for it, and
        // `writeable_length_hint` below reports a length of exactly zero.
        // Writing the singleton here would contradict all three.
        if self.is_empty() {
            return Ok(());
        }
        sink.write_char(UNICODE_EXT_CHAR)?;
        if !self.attributes.is_empty() {
            sink.write_char('-')?;
            writeable::Writeable::write_to(&self.attributes, sink)?;
        }
        if !self.keywords.is_empty() {
            sink.write_char('-')?;
            writeable::Writeable::write_to(&self.keywords, sink)?;
        }
        Ok(())
    }

    fn writeable_length_hint(&self) -> writeable::LengthHint {
        if self.is_empty() {
            return writeable::LengthHint::exact(0);
        }
        // One byte for the `u` singleton, plus each non-empty part preceded by
        // its `-` separator.
        let mut result = writeable::LengthHint::exact(1);
        if !self.attributes.is_empty() {
            result += writeable::Writeable::writeable_length_hint(&self.attributes) + 1;
        }
        if !self.keywords.is_empty() {
            result += writeable::Writeable::writeable_length_hint(&self.keywords) + 1;
        }
        result
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A full extension round-trips through parse and display; the bare
    /// singleton is rejected.
    #[test]
    fn test_unicode_extension_fromstr() {
        let parsed = "u-foo-hc-h12"
            .parse::<Unicode>()
            .expect("Failed to parse Unicode");
        assert_eq!(parsed.to_string(), "u-foo-hc-h12");

        assert!("u".parse::<Unicode>().is_err());
    }
}

View File

@@ -0,0 +1,181 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::str::FromStr;
use crate::parser::ParseError;
use crate::subtags::{Region, Subtag};
impl_tinystr_subtag!(
/// A subdivision suffix used in [`SubdivisionId`].
///
/// This suffix represents a specific subdivision code under a given [`Region`].
/// For example the value of [`SubdivisionId`] may be `gbsct`, where the [`SubdivisionSuffix`]
/// is `sct` for Scotland.
///
/// Such a value associated with a key `rg` means that the locale should use Unit Preferences
/// (default calendar, currency, week data, time cycle, measurement system) for Scotland, even if the
/// [`LanguageIdentifier`](crate::LanguageIdentifier) is `en-US`.
///
/// A subdivision suffix has to be a sequence of alphanumerical characters no
/// shorter than one and no longer than four characters.
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{subdivision_suffix, SubdivisionSuffix};
///
/// let ss: SubdivisionSuffix =
/// "sct".parse().expect("Failed to parse a SubdivisionSuffix.");
///
/// assert_eq!(ss, subdivision_suffix!("sct"));
/// ```
// Name of the generated type.
SubdivisionSuffix,
// Module path of the type inside the crate.
extensions::unicode,
// Public name of the generated construction macro.
subdivision_suffix,
// Internal (exported) name of that macro.
extensions_unicode_subdivision_suffix,
// Accepted length range, in bytes.
1..=4,
// Identifier bound to the candidate string in the three expressions below.
s,
// Validation applied when parsing.
s.is_ascii_alphanumeric(),
// Normalization applied to a validated string.
s.to_ascii_lowercase(),
// Check applied to raw values, which must already be normalized.
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
// `ParseError` variant returned on failure.
InvalidExtension,
// Valid example(s), used in generated docs and tests.
["sct"],
// Invalid example(s), used in generated docs and tests.
["toolooong"],
);
/// A Subdivision Id as defined in [`Unicode Locale Identifier`].
///
/// Subdivision Id is used in [`Unicode`] extensions:
/// * `rg` - Regional Override
/// * `sd` - Regional Subdivision
///
/// In both cases the subdivision is composed of a [`Region`] and a [`SubdivisionSuffix`] which represents
/// different meaning depending on the key.
///
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#unicode_subdivision_id
/// [`Unicode`]: crate::extensions::unicode::Unicode
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{subdivision_suffix, SubdivisionId},
/// subtags::region,
/// };
///
/// let ss = subdivision_suffix!("zzzz");
/// let region = region!("gb");
///
/// let si = SubdivisionId::new(region, ss);
///
/// assert_eq!(si.to_string(), "gbzzzz");
/// ```
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
#[non_exhaustive]
pub struct SubdivisionId {
/// A region field of a Subdivision Id.
pub region: Region,
/// A subdivision suffix field of a Subdivision Id.
pub suffix: SubdivisionSuffix,
}
impl SubdivisionId {
    /// Returns a new [`SubdivisionId`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{
    ///     extensions::unicode::{subdivision_suffix, SubdivisionId},
    ///     subtags::region,
    /// };
    ///
    /// let ss = subdivision_suffix!("zzzz");
    /// let region = region!("gb");
    ///
    /// let si = SubdivisionId::new(region, ss);
    ///
    /// assert_eq!(si.to_string(), "gbzzzz");
    /// ```
    pub const fn new(region: Region, suffix: SubdivisionSuffix) -> Self {
        Self { region, suffix }
    }

    /// A constructor which takes a str slice, parses it and
    /// produces a well-formed [`SubdivisionId`].
    #[inline]
    pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
        Self::try_from_utf8(s.as_bytes())
    }

    /// See [`Self::try_from_str`]
    ///
    /// The input is split into a region code followed by a subdivision suffix.
    /// The first byte decides the region length: two bytes when it is an ASCII
    /// letter, three bytes when it is an ASCII digit.
    ///
    /// # Errors
    ///
    /// Returns [`ParseError::InvalidExtension`] when the input is empty, starts
    /// with a byte that is neither an ASCII letter nor digit, is shorter than
    /// the region code, or contains an invalid region. Errors from parsing the
    /// suffix are propagated.
    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
        let region_len = match code_units.first() {
            Some(b) if b.is_ascii_alphabetic() => 2,
            Some(b) if b.is_ascii_digit() => 3,
            _ => return Err(ParseError::InvalidExtension),
        };
        let (region_code_units, suffix_code_units) = code_units
            .split_at_checked(region_len)
            .ok_or(ParseError::InvalidExtension)?;
        let region =
            Region::try_from_utf8(region_code_units).map_err(|_| ParseError::InvalidExtension)?;
        let suffix = SubdivisionSuffix::try_from_utf8(suffix_code_units)?;
        Ok(Self { region, suffix })
    }

    /// Convert to [`Subtag`]
    ///
    /// The subtag consists of the lowercased region code immediately followed
    /// by the suffix, i.e. the same text that the `Writeable` implementation of
    /// [`SubdivisionId`] produces.
    pub fn into_subtag(self) -> Subtag {
        // The region is kept in upper case (e.g. `GB`), while the written form
        // of a subdivision id is lower case (`gbzzzz`). The subtag is built
        // through an unvalidated constructor, so the case has to be fixed here.
        // NOTE(review): assumes `Subtag::from_tinystr_unvalidated` does not
        // normalize case itself; lowercasing is harmless if it does.
        let result = self
            .region
            .to_tinystr()
            .to_ascii_lowercase()
            .concat(self.suffix.to_tinystr());
        Subtag::from_tinystr_unvalidated(result)
    }
}
// Writes the subdivision id as the lowercased region code immediately followed
// by the suffix, with no separator (for example `gbzzzz`).
impl writeable::Writeable for SubdivisionId {
#[inline]
fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
// The region is lowercased explicitly before being written.
sink.write_str(self.region.to_tinystr().to_ascii_lowercase().as_str())?;
sink.write_str(self.suffix.as_str())
}
#[inline]
fn writeable_length_hint(&self) -> writeable::LengthHint {
// Changing the case does not change the length, so the hints of the two
// parts can simply be added.
self.region.writeable_length_hint() + self.suffix.writeable_length_hint()
}
}
writeable::impl_display_with_writeable!(SubdivisionId, #[cfg(feature = "alloc")]);
/// Parses a [`SubdivisionId`] from a string such as `"gbzzzz"`, enabling `str::parse`.
impl FromStr for SubdivisionId {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Thin adapter over the inherent constructor.
Self::try_from_str(s)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A valid id splits into an upper-case region and a suffix and displays
    /// in lower case; inputs that are too short are rejected.
    #[test]
    fn test_subdivisionid_fromstr() {
        let parsed = "gbzzzz"
            .parse::<SubdivisionId>()
            .expect("Failed to parse SubdivisionId");
        assert_eq!(parsed.region.to_string(), "GB");
        assert_eq!(parsed.suffix.to_string(), "zzzz");
        assert_eq!(parsed.to_string(), "gbzzzz");

        for sample in ["", "gb", "o"] {
            assert!(
                sample.parse::<SubdivisionId>().is_err(),
                "Should fail: {sample}"
            );
        }
    }
}

View File

@@ -0,0 +1,377 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::parser::ParseError;
use crate::parser::SubtagIterator;
use crate::shortvec::{ShortBoxSlice, ShortBoxSliceIntoIter};
use crate::subtags::{subtag, Subtag};
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A value used in a list of [`Keywords`](super::Keywords).
///
/// The value has to be a sequence of one or more alphanumerical strings
/// separated by `-`.
/// Each part of the sequence has to be no shorter than three characters and no
/// longer than 8.
///
/// A value without any subtags stands for `true`: the parsing constructors
/// drop `true` subtags instead of storing them.
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{value, Value};
/// use writeable::assert_writeable_eq;
///
/// assert_writeable_eq!(value!("gregory"), "gregory");
/// assert_writeable_eq!(
/// "islamic-civil".parse::<Value>().unwrap(),
/// "islamic-civil"
/// );
///
/// // The value "true" has the special, empty string representation
/// assert_eq!(value!("true").to_string(), "");
/// ```
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Default)]
pub struct Value(ShortBoxSlice<Subtag>);
/// The `true` subtag. The parsing constructors of [`Value`] compare against it
/// and omit it, so that an empty [`Value`] represents `true`.
const TRUE_VALUE: Subtag = subtag!("true");
impl Value {
    /// A constructor which takes a str slice, parses it and
    /// produces a well-formed [`Value`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::extensions::unicode::Value;
    ///
    /// Value::try_from_str("buddhist").expect("Parsing failed.");
    /// ```
    ///
    /// # `alloc` Cargo feature
    ///
    /// Without the `alloc` Cargo feature, this only supports parsing
    /// up to two (non-`true`) subtags, and will return an error for
    /// longer strings.
    #[inline]
    pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
        Self::try_from_utf8(s.as_bytes())
    }

    /// See [`Self::try_from_str`]
    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
        let mut v = ShortBoxSlice::new();
        // Empty input is accepted and yields the empty value.
        if !code_units.is_empty() {
            for chunk in SubtagIterator::new(code_units) {
                let subtag = Subtag::try_from_utf8(chunk)?;
                // `true` subtags are not stored.
                if subtag != TRUE_VALUE {
                    #[cfg(feature = "alloc")]
                    v.push(subtag);
                    // Without `alloc` the list cannot grow, so it is rebuilt
                    // through the fixed-size constructors and is limited to
                    // two subtags.
                    #[cfg(not(feature = "alloc"))]
                    if v.is_empty() {
                        v = ShortBoxSlice::new_single(subtag);
                    } else if let &[prev] = &*v {
                        v = ShortBoxSlice::new_double(prev, subtag);
                    } else {
                        return Err(ParseError::InvalidSubtag);
                    }
                }
            }
        }
        Ok(Self(v))
    }

    /// Returns a reference to a single [`Subtag`] if the [`Value`] contains exactly one
    /// subtag, or `None` otherwise.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::str::FromStr;
    /// use icu::locale::extensions::unicode::Value;
    ///
    /// let value1 = Value::from_str("foo").expect("failed to parse a Value");
    /// let value2 = Value::from_str("foo-bar").expect("failed to parse a Value");
    ///
    /// assert!(value1.as_single_subtag().is_some());
    /// assert!(value2.as_single_subtag().is_none());
    /// ```
    pub const fn as_single_subtag(&self) -> Option<&Subtag> {
        self.0.single()
    }

    /// Destructs into a single [`Subtag`] if the [`Value`] contains exactly one
    /// subtag, or returns `None` otherwise.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::str::FromStr;
    /// use icu::locale::extensions::unicode::Value;
    ///
    /// let value1 = Value::from_str("foo").expect("failed to parse a Value");
    /// let value2 = Value::from_str("foo-bar").expect("failed to parse a Value");
    ///
    /// assert!(value1.into_single_subtag().is_some());
    /// assert!(value2.into_single_subtag().is_none());
    /// ```
    pub fn into_single_subtag(self) -> Option<Subtag> {
        self.0.into_single()
    }

    /// Returns all subtags of this value as a slice.
    #[doc(hidden)]
    pub fn as_subtags_slice(&self) -> &[Subtag] {
        &self.0
    }

    /// Appends a subtag to the back of a [`Value`].
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{extensions::unicode::Value, subtags::subtag};
    ///
    /// let mut v = Value::default();
    /// v.push_subtag(subtag!("foo"));
    /// v.push_subtag(subtag!("bar"));
    /// assert_eq!(v, "foo-bar");
    /// ```
    #[cfg(feature = "alloc")]
    pub fn push_subtag(&mut self, subtag: Subtag) {
        self.0.push(subtag);
    }

    /// Returns the number of subtags in the [`Value`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{extensions::unicode::Value, subtags::subtag};
    ///
    /// let mut v = Value::default();
    /// assert_eq!(v.subtag_count(), 0);
    /// v.push_subtag(subtag!("foo"));
    /// assert_eq!(v.subtag_count(), 1);
    /// ```
    pub fn subtag_count(&self) -> usize {
        self.0.len()
    }

    /// Creates an empty [`Value`], which corresponds to a "true" value.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::extensions::unicode::{value, Value};
    ///
    /// assert_eq!(value!("true"), Value::new_empty());
    /// ```
    pub const fn new_empty() -> Self {
        Self(ShortBoxSlice::new())
    }

    /// Returns `true` if the Value has no subtags.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{extensions::unicode::Value, subtags::subtag};
    ///
    /// let mut v = Value::default();
    /// assert!(v.is_empty());
    /// ```
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Removes and returns the subtag at position `index` within the value,
    /// shifting all subtags after it to the left.
    ///
    /// Returns `None`, leaving the value untouched, when `index` is out of
    /// bounds (that is, not smaller than [`Self::subtag_count`]).
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{extensions::unicode::Value, subtags::subtag};
    /// let mut v = Value::default();
    /// v.push_subtag(subtag!("foo"));
    /// v.push_subtag(subtag!("bar"));
    /// v.push_subtag(subtag!("baz"));
    ///
    /// assert_eq!(v.remove_subtag(1), Some(subtag!("bar")));
    /// assert_eq!(v, "foo-baz");
    ///
    /// // Two subtags are left, so index 2 is out of bounds.
    /// assert_eq!(v.remove_subtag(2), None);
    /// assert_eq!(v, "foo-baz");
    /// ```
    pub fn remove_subtag(&mut self, idx: usize) -> Option<Subtag> {
        // Valid indices are `0..len`. The check must reject `idx == len` as
        // well, otherwise the removal below is asked for an element past the end.
        if idx < self.0.len() {
            Some(self.0.remove(idx))
        } else {
            None
        }
    }

    /// Returns a reference to a subtag at index.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{extensions::unicode::Value, subtags::subtag};
    /// let mut v = Value::default();
    /// v.push_subtag(subtag!("foo"));
    /// v.push_subtag(subtag!("bar"));
    /// v.push_subtag(subtag!("baz"));
    ///
    /// assert_eq!(v.get_subtag(1), Some(&subtag!("bar")));
    /// assert_eq!(v.get_subtag(3), None);
    /// ```
    pub fn get_subtag(&self, idx: usize) -> Option<&Subtag> {
        self.0.get(idx)
    }

    /// Creates a [`Value`] from at most one subtag. Both `None` and the `true`
    /// subtag produce the empty value.
    #[doc(hidden)]
    pub const fn from_subtag(subtag: Option<Subtag>) -> Self {
        match subtag {
            None | Some(TRUE_VALUE) => Self(ShortBoxSlice::new()),
            Some(val) => Self(ShortBoxSlice::new_single(val)),
        }
    }

    /// Creates a [`Value`] holding the two given subtags, `f` first and `s`
    /// second. The subtags are not checked against `true`.
    #[doc(hidden)]
    pub fn from_two_subtags(f: Subtag, s: Subtag) -> Self {
        Self(ShortBoxSlice::new_double(f, s))
    }

    /// A constructor which takes a list of [`Subtag`] elements.
    ///
    /// No validation is performed: in particular, `true` subtags are not
    /// removed, unlike in [`Self::try_from_utf8`].
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::extensions::unicode::Value;
    /// use icu::locale::subtags::subtag;
    ///
    /// let subtag1 = subtag!("foobar");
    /// let subtag2 = subtag!("testing");
    /// let mut v = vec![subtag1, subtag2];
    /// v.sort();
    /// v.dedup();
    ///
    /// let value = Value::from_vec_unchecked(v);
    /// ```
    ///
    /// Notice: For performance- and memory-constrained environments, it is recommended
    /// for the caller to use [`binary_search`](slice::binary_search) instead of [`sort`](slice::sort)
    /// and [`dedup`](Vec::dedup()).
    ///
    /// NOTE(review): the sorting and deduplication shown above look inherited
    /// from set-like subtag lists; the order of a value's subtags is
    /// significant (`islamic-civil`), so confirm whether sorting is intended here.
    #[cfg(feature = "alloc")]
    pub fn from_vec_unchecked(input: Vec<Subtag>) -> Self {
        Self(input.into())
    }

    /// Wraps an existing subtag list without inspecting it.
    #[allow(dead_code)]
    pub(crate) fn from_short_slice_unchecked(input: ShortBoxSlice<Subtag>) -> Self {
        Self(input)
    }

    /// Parses a single value subtag.
    ///
    /// Returns `Ok(None)` for the `true` subtag, `Ok(Some(_))` for any other
    /// valid subtag, and [`ParseError::InvalidSubtag`] otherwise.
    pub(crate) const fn parse_subtag_from_utf8(t: &[u8]) -> Result<Option<Subtag>, ParseError> {
        match Subtag::try_from_utf8(t) {
            Ok(TRUE_VALUE) => Ok(None),
            Ok(s) => Ok(Some(s)),
            Err(_) => Err(ParseError::InvalidSubtag),
        }
    }

    /// Calls `f` with the string form of every subtag, in order, stopping at
    /// the first error returned by `f`.
    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
    where
        F: FnMut(&str) -> Result<(), E>,
    {
        self.0.iter().map(Subtag::as_str).try_for_each(f)
    }
}
/// Consumes the [`Value`] and iterates over its subtags by value.
impl IntoIterator for Value {
type Item = Subtag;
type IntoIter = ShortBoxSliceIntoIter<Subtag>;
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
/// Collects subtags into a [`Value`]. The subtags are not filtered, so `true`
/// subtags are kept if the iterator yields them.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromIterator<Subtag> for Value {
fn from_iter<T: IntoIterator<Item = Subtag>>(iter: T) -> Self {
Self(ShortBoxSlice::from_iter(iter))
}
}
/// Allows appending subtags from any iterator to an existing [`Value`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl Extend<Subtag> for Value {
    /// Pushes every subtag produced by `iter` onto the end of this value, in
    /// iteration order.
    fn extend<T: IntoIterator<Item = Subtag>>(&mut self, iter: T) {
        iter.into_iter().for_each(|subtag| self.0.push(subtag));
    }
}
/// Parses a [`Value`] from a string such as `"islamic-civil"`, enabling `str::parse`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Value {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Thin adapter over the inherent constructor.
Self::try_from_str(s)
}
}
/// Compares a [`Value`] with a string slice by comparing the written form of
/// the value with the bytes of the string. No parsing or case folding of
/// `other` takes place.
impl PartialEq<&str> for Value {
fn eq(&self, other: &&str) -> bool {
writeable::cmp_utf8(self, other.as_bytes()).is_eq()
}
}
// NOTE(review): this crate-local macro (defined elsewhere) presumably generates the
// `Writeable`/`Display` implementations for `Value` that `cmp_utf8` and `to_string()`
// rely on; the string literals look like sample subtags for generated tests — confirm.
impl_writeable_for_subtag_list!(Value, "islamic", "civil");
/// A macro allowing for compile-time construction of valid Unicode [`Value`] subtag.
///
/// The macro only supports single-subtag values.
///
/// The literal is parsed inside a `const` block, so an invalid subtag causes a
/// panic during constant evaluation, i.e. a compilation failure. The literal
/// `"true"` produces the empty value.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value};
/// use icu::locale::Locale;
///
/// let loc: Locale = "de-u-ca-buddhist".parse().unwrap();
///
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("ca")),
/// Some(&value!("buddhist"))
/// );
/// ```
///
/// [`Value`]: crate::extensions::unicode::Value
#[macro_export]
#[doc(hidden)] // macro
macro_rules! extensions_unicode_value {
($value:literal) => {
const {
$crate::extensions::unicode::Value::from_subtag(
match $crate::subtags::Subtag::try_from_utf8($value.as_bytes()) {
Ok(r) => Some(r),
_ => panic!(concat!("Invalid Unicode extension value: ", $value)),
},
)
}
};
}
// Re-exported under the short name `value` for use as `unicode::value!`.
#[doc(inline)]
pub use extensions_unicode_value as value;

427
vendor/icu_locale_core/src/helpers.rs vendored Normal file
View File

@@ -0,0 +1,427 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
macro_rules! impl_tinystr_subtag {
(
$(#[$doc:meta])*
$name:ident,
$($path:ident)::+,
$macro_name:ident,
$internal_macro_name:ident,
$len_start:literal..=$len_end:literal,
$tinystr_ident:ident,
$validate:expr,
$normalize:expr,
$is_normalized:expr,
$error:ident,
[$good_example:literal $(,$more_good_examples:literal)*],
[$bad_example:literal $(, $more_bad_examples:literal)*],
) => {
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
#[repr(transparent)]
$(#[$doc])*
pub struct $name(tinystr::TinyAsciiStr<$len_end>);
impl $name {
/// A constructor which takes a str slice, parses it and
#[doc = concat!("produces a well-formed [`", stringify!($name), "`].")]
///
/// # Examples
///
/// ```
#[doc = concat!("use icu_locale_core::", stringify!($($path::)+), stringify!($name), ";")]
///
#[doc = concat!("assert!(", stringify!($name), "::try_from_str(", stringify!($good_example), ").is_ok());")]
#[doc = concat!("assert!(", stringify!($name), "::try_from_str(", stringify!($bad_example), ").is_err());")]
/// ```
#[inline]
pub const fn try_from_str(s: &str) -> Result<Self, crate::parser::errors::ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
pub const fn try_from_utf8(
code_units: &[u8],
) -> Result<Self, crate::parser::errors::ParseError> {
if code_units.len() < $len_start || code_units.len() > $len_end {
return Err(crate::parser::errors::ParseError::$error);
}
match tinystr::TinyAsciiStr::try_from_utf8(code_units) {
Ok($tinystr_ident) if $validate => Ok(Self($normalize)),
_ => Err(crate::parser::errors::ParseError::$error),
}
}
#[doc = concat!("Safely creates a [`", stringify!($name), "`] from its raw format")]
/// as returned by [`Self::into_raw`]. Unlike [`Self::try_from_utf8`],
/// this constructor only takes normalized values.
pub const fn try_from_raw(
raw: [u8; $len_end],
) -> Result<Self, crate::parser::errors::ParseError> {
if let Ok($tinystr_ident) = tinystr::TinyAsciiStr::<$len_end>::try_from_raw(raw) {
if $tinystr_ident.len() >= $len_start && $is_normalized {
Ok(Self($tinystr_ident))
} else {
Err(crate::parser::errors::ParseError::$error)
}
} else {
Err(crate::parser::errors::ParseError::$error)
}
}
#[doc = concat!("Unsafely creates a [`", stringify!($name), "`] from its raw format")]
/// as returned by [`Self::into_raw`]. Unlike [`Self::try_from_utf8`],
/// this constructor only takes normalized values.
///
/// # Safety
///
/// This function is safe iff [`Self::try_from_raw`] returns an `Ok`. This is the case
/// for inputs that are correctly normalized.
pub const unsafe fn from_raw_unchecked(v: [u8; $len_end]) -> Self {
Self(tinystr::TinyAsciiStr::from_utf8_unchecked(v))
}
/// Deconstructs into a raw format to be consumed by
/// [`from_raw_unchecked`](Self::from_raw_unchecked()) or
/// [`try_from_raw`](Self::try_from_raw()).
pub const fn into_raw(self) -> [u8; $len_end] {
*self.0.all_bytes()
}
#[inline]
/// A helper function for displaying as a `&str`.
pub const fn as_str(&self) -> &str {
self.0.as_str()
}
#[doc(hidden)]
pub const fn to_tinystr(&self) -> tinystr::TinyAsciiStr<$len_end> {
self.0
}
/// Compare with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted
/// `self` to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`](core::cmp::Ordering::Equal)
/// is `self.as_str().as_bytes()`.
#[inline]
pub fn strict_cmp(self, other: &[u8]) -> core::cmp::Ordering {
self.as_str().as_bytes().cmp(other)
}
/// Compare with a potentially unnormalized BCP-47 string.
///
/// The return value is equivalent to what would happen if you first parsed the
/// BCP-47 string and then performed a structural comparison.
///
#[inline]
pub fn normalizing_eq(self, other: &str) -> bool {
self.as_str().eq_ignore_ascii_case(other)
}
}
impl core::str::FromStr for $name {
type Err = crate::parser::errors::ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
Self::try_from_str(s)
}
}
impl<'l> From<&'l $name> for &'l str {
fn from(input: &'l $name) -> Self {
input.as_str()
}
}
impl From<$name> for tinystr::TinyAsciiStr<$len_end> {
fn from(input: $name) -> Self {
input.to_tinystr()
}
}
impl writeable::Writeable for $name {
#[inline]
fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
sink.write_str(self.as_str())
}
#[inline]
fn writeable_length_hint(&self) -> writeable::LengthHint {
writeable::LengthHint::exact(self.0.len())
}
fn writeable_borrow(&self) -> Option<&str> {
Some(self.0.as_str())
}
}
writeable::impl_display_with_writeable!($name, #[cfg(feature = "alloc")]);
#[doc = concat!("A macro allowing for compile-time construction of valid [`", stringify!($name), "`] subtags.")]
///
/// # Examples
///
/// Parsing errors don't have to be handled at runtime:
/// ```
/// assert_eq!(
#[doc = concat!(" icu_locale_core::", $(stringify!($path), "::",)+ stringify!($macro_name), "!(", stringify!($good_example) ,"),")]
#[doc = concat!(" ", stringify!($good_example), ".parse::<icu_locale_core::", $(stringify!($path), "::",)+ stringify!($name), ">().unwrap()")]
/// );
/// ```
///
/// Invalid input is a compile failure:
/// ```compile_fail,E0080
#[doc = concat!("icu_locale_core::", $(stringify!($path), "::",)+ stringify!($macro_name), "!(", stringify!($bad_example) ,");")]
/// ```
///
#[doc = concat!("[`", stringify!($name), "`]: crate::", $(stringify!($path), "::",)+ stringify!($name))]
#[macro_export]
#[doc(hidden)] // macro
macro_rules! $internal_macro_name {
($string:literal) => { const {
use $crate::$($path ::)+ $name;
match $name::try_from_utf8($string.as_bytes()) {
Ok(r) => r,
_ => panic!(concat!("Invalid ", $(stringify!($path), "::",)+ stringify!($name), ": ", $string)),
}
}};
}
#[doc(inline)]
pub use $internal_macro_name as $macro_name;
#[cfg(feature = "databake")]
impl databake::Bake for $name {
fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
env.insert("icu_locale_core");
let string = self.as_str();
databake::quote! { icu_locale_core::$($path::)+ $macro_name!(#string) }
}
}
#[cfg(feature = "databake")]
impl databake::BakeSize for $name {
fn borrows_size(&self) -> usize {
0
}
}
#[test]
fn test_construction() {
let maybe = $name::try_from_utf8($good_example.as_bytes());
assert!(maybe.is_ok());
assert_eq!(maybe, $name::try_from_raw(maybe.unwrap().into_raw()));
assert_eq!(maybe.unwrap().as_str(), $good_example);
$(
let maybe = $name::try_from_utf8($more_good_examples.as_bytes());
assert!(maybe.is_ok());
assert_eq!(maybe, $name::try_from_raw(maybe.unwrap().into_raw()));
assert_eq!(maybe.unwrap().as_str(), $more_good_examples);
)*
assert!($name::try_from_utf8($bad_example.as_bytes()).is_err());
$(
assert!($name::try_from_utf8($more_bad_examples.as_bytes()).is_err());
)*
}
#[test]
fn test_writeable() {
writeable::assert_writeable_eq!(&$good_example.parse::<$name>().unwrap(), $good_example);
$(
writeable::assert_writeable_eq!($more_good_examples.parse::<$name>().unwrap(), $more_good_examples);
)*
}
#[cfg(feature = "serde")]
impl serde::Serialize for $name {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
self.0.serialize(serializer)
}
}
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for $name {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::de::Deserializer<'de>,
{
struct Visitor;
impl<'de> serde::de::Visitor<'de> for Visitor {
type Value = $name;
fn expecting(
&self,
formatter: &mut core::fmt::Formatter<'_>,
) -> core::fmt::Result {
write!(formatter, "a valid BCP-47 {}", stringify!($name))
}
fn visit_str<E: serde::de::Error>(self, s: &str) -> Result<Self::Value, E> {
s.parse().map_err(serde::de::Error::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_string(Visitor)
} else {
Self::try_from_raw(serde::de::Deserialize::deserialize(deserializer)?)
.map_err(serde::de::Error::custom)
}
}
}
// Safety checklist for ULE:
//
// 1. Must not include any uninitialized or padding bytes (true since transparent over a ULE).
// 2. Must have an alignment of 1 byte (true since transparent over a ULE).
// 3. ULE::validate_bytes() checks that the given byte slice represents a valid slice.
// 4. ULE::validate_bytes() checks that the given byte slice has a valid length.
// 5. All other methods must be left with their default impl.
// 6. Byte equality is semantic equality.
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::ULE for $name {
fn validate_bytes(bytes: &[u8]) -> Result<(), zerovec::ule::UleError> {
let it = bytes.chunks_exact(core::mem::size_of::<Self>());
if !it.remainder().is_empty() {
return Err(zerovec::ule::UleError::length::<Self>(bytes.len()));
}
for v in it {
// The following can be removed once `array_chunks` is stabilized.
let mut a = [0; core::mem::size_of::<Self>()];
a.copy_from_slice(v);
if Self::try_from_raw(a).is_err() {
return Err(zerovec::ule::UleError::parse::<Self>());
}
}
Ok(())
}
}
#[cfg(feature = "zerovec")]
impl zerovec::ule::NicheBytes<$len_end> for $name {
const NICHE_BIT_PATTERN: [u8; $len_end] = <tinystr::TinyAsciiStr<$len_end>>::NICHE_BIT_PATTERN;
}
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for $name {
type ULE = Self;
fn to_unaligned(self) -> Self::ULE {
self
}
fn from_unaligned(unaligned: Self::ULE) -> Self {
unaligned
}
}
#[cfg(feature = "zerovec")]
#[cfg(feature = "alloc")]
impl<'a> zerovec::maps::ZeroMapKV<'a> for $name {
type Container = zerovec::ZeroVec<'a, $name>;
type Slice = zerovec::ZeroSlice<$name>;
type GetType = $name;
type OwnedType = $name;
}
};
}
// Implements `writeable::Writeable` for `$type` on top of its `for_each_subtag_str` method,
// then invokes `writeable::impl_display_with_writeable!` for the same type.
//
// Arguments:
// * `$type` - the type to implement the trait for; it must provide `for_each_subtag_str`.
// * `$self, $borrow_cond => $borrow` (optional) - when present, a `writeable_borrow` method is
//   generated as well. `$self` is the identifier that the two expressions use to refer to the
//   receiver; the method evaluates to `$borrow` when `$borrow_cond` holds and to `None` otherwise.
#[macro_export]
#[doc(hidden)]
macro_rules! impl_writeable_for_each_subtag_str_no_test {
($type:tt $(, $self:ident, $borrow_cond:expr => $borrow:expr)?) => {
impl writeable::Writeable for $type {
// Writes every subtag string in the order `for_each_subtag_str` yields them, with a
// single `-` between consecutive subtags (none before the first one).
fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
let mut initial = true;
self.for_each_subtag_str(&mut |subtag| {
if initial {
initial = false;
} else {
sink.write_char('-')?;
}
sink.write_str(subtag)
})
}
// Sums the lengths of all subtag strings plus one separator for every subtag after the
// first, mirroring what `write_to` emits.
#[inline]
fn writeable_length_hint(&self) -> writeable::LengthHint {
let mut result = writeable::LengthHint::exact(0);
let mut initial = true;
// The error type is pinned to `Infallible` because the closure only ever returns `Ok(())`.
self.for_each_subtag_str::<core::convert::Infallible, _>(&mut |subtag| {
if initial {
initial = false;
} else {
result += 1;
}
result += subtag.len();
Ok(())
})
.expect("infallible");
result
}
// Only generated when the optional borrow arguments were passed to the macro.
$(
fn writeable_borrow(&self) -> Option<&str> {
// Bind the caller-chosen identifier so `$borrow_cond` and `$borrow` can name the receiver.
let $self = self;
if $borrow_cond {
$borrow
} else {
None
}
}
)?
}
writeable::impl_display_with_writeable!($type, #[cfg(feature = "alloc")]);
};
}
// Implements `Writeable` for a subtag-list type and generates a `test_writeable` test for it.
//
// Arguments:
// * `$type` - the list type; the expansion reads its first tuple field (`.0`) and uses
//   `$type::default()` and `$type::from_vec_unchecked(..)` in the generated test.
// * `$sample1`, `$sample2` - string literals that are parsed into list elements in the test.
macro_rules! impl_writeable_for_subtag_list {
($type:tt, $sample1:literal, $sample2:literal) => {
// `selff` is the identifier bound to the receiver inside the borrow expressions: when the
// list holds exactly one subtag, `writeable_borrow` hands out that subtag's string.
impl_writeable_for_each_subtag_str_no_test!($type, selff, selff.0.len() == 1 => #[allow(clippy::unwrap_used)] { Some(selff.0.get(0).unwrap().as_str()) } );
#[test]
fn test_writeable() {
// The default value is expected to write as the empty string.
writeable::assert_writeable_eq!(&$type::default(), "");
// A single element is expected to write as exactly that element.
writeable::assert_writeable_eq!(
&$type::from_vec_unchecked(alloc::vec![$sample1.parse().unwrap()]),
$sample1,
);
// Two elements are expected to be joined with `-`.
writeable::assert_writeable_eq!(
&$type::from_vec_unchecked(vec![
$sample1.parse().unwrap(),
$sample2.parse().unwrap()
]),
core::concat!($sample1, "-", $sample2),
);
}
};
}
// Implements `Writeable` for a key/value collection type and generates a `test_writeable`
// test for it.
//
// Arguments:
// * `$type` - the collection type; the generated test uses `$type::default()` and
//   `$type::from_tuple_vec(..)`.
// * `$key1`, `$value1` - literals for the first pair, expected to write as `key1-value1`.
// * `$key2` - literal key of the second pair; its value is always the literal "true".
// * `$expected2` - the text the second pair is expected to contribute to the output.
//   NOTE(review): presumably this is just the key because a "true" value is omitted when
//   writing - confirm against the concrete types this macro is used with.
macro_rules! impl_writeable_for_key_value {
($type:tt, $key1:literal, $value1:literal, $key2:literal, $expected2:literal) => {
// No borrow arguments are passed, so no `writeable_borrow` override is generated.
impl_writeable_for_each_subtag_str_no_test!($type);
#[test]
fn test_writeable() {
// The default value is expected to write as the empty string.
writeable::assert_writeable_eq!(&$type::default(), "");
writeable::assert_writeable_eq!(
&$type::from_tuple_vec(vec![($key1.parse().unwrap(), $value1.parse().unwrap())]),
core::concat!($key1, "-", $value1),
);
writeable::assert_writeable_eq!(
&$type::from_tuple_vec(vec![
($key1.parse().unwrap(), $value1.parse().unwrap()),
($key2.parse().unwrap(), "true".parse().unwrap())
]),
core::concat!($key1, "-", $value1, "-", $expected2),
);
}
};
}

681
vendor/icu_locale_core/src/langid.rs vendored Normal file
View File

@@ -0,0 +1,681 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
use crate::parser;
use crate::subtags;
use crate::ParseError;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
/// A core struct representing a [`Unicode BCP47 Language Identifier`].
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`LanguageIdentifier::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`LanguageIdentifier::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
///
/// * *well-formed* - syntactically correct
/// * *valid* - well-formed and only uses registered language, region, script and variant subtags...
/// * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment parsing normalizes a well-formed language identifier converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any syntactically invalid subtags will cause the parsing to fail with an error.
///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacement is performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// # Serde
///
/// This type implements `serde::Serialize` and `serde::Deserialize` if the
/// `"serde"` Cargo feature is enabled on the crate.
///
/// The value will be serialized as a string and parsed when deserialized.
/// For tips on efficient storage and retrieval of locales, see [`crate::zerovec`].
///
/// # Examples
///
/// Simple example:
///
/// ```
/// use icu::locale::{
/// langid,
/// subtags::{language, region},
/// };
///
/// let li = langid!("en-US");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, None);
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.len(), 0);
/// ```
///
/// More complex example:
///
/// ```
/// use icu::locale::{
/// langid,
/// subtags::{language, region, script, variant},
/// };
///
/// let li = langid!("eN-latn-Us-Valencia");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, Some(script!("Latn")));
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.first(), Some(&variant!("valencia")));
/// ```
///
/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct LanguageIdentifier {
/// Language subtag of the language identifier (always present).
pub language: subtags::Language,
/// Script subtag of the language identifier, or `None` if it has none.
pub script: Option<subtags::Script>,
/// Region subtag of the language identifier, or `None` if it has none.
pub region: Option<subtags::Region>,
/// Variant subtags of the language identifier (possibly empty).
pub variants: subtags::Variants,
}
impl LanguageIdentifier {
/// The unknown language identifier "und".
pub const UNKNOWN: Self = crate::langid!("und");
/// A constructor which takes a string slice, parses it and
/// produces a well-formed [`LanguageIdentifier`].
///
/// The bytes of `s` are forwarded to [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a [`ParseError`] if `s` cannot be parsed as a language identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// LanguageIdentifier::try_from_str("en-US").expect("Parsing failed");
/// ```
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// Parses a UTF-8 byte slice into a well-formed [`LanguageIdentifier`].
///
/// This is the byte-slice counterpart of [`Self::try_from_str`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    // Parse in language-identifier mode (as opposed to `ParserMode::Locale`).
    let mode = parser::ParserMode::LanguageIdentifier;
    parser::parse_language_identifier(code_units, mode)
}
// `const` parsing entry point that returns the parsed parts as a tuple
// `(language, script, region, variant)` instead of a `LanguageIdentifier`, with room for at
// most one variant in the result.
// NOTE(review): per the `macro use` marker this presumably backs the `langid!` macro; how input
// with more than one variant is treated is decided by the parser function and is not visible
// here - confirm there.
#[doc(hidden)] // macro use
#[expect(clippy::type_complexity)]
// The return type should be `Result<Self, ParseError>` once the `const_precise_live_drops`
// is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
pub const fn try_from_utf8_with_single_variant(
code_units: &[u8],
) -> Result<
(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
Option<subtags::Variant>,
),
ParseError,
> {
// Same parser mode as `try_from_utf8`: the input is treated as a language identifier.
crate::parser::parse_language_identifier_with_single_variant(
code_units,
parser::ParserMode::LanguageIdentifier,
)
}
/// A constructor which takes a utf8 slice which may contain extension keys,
/// parses it and produces a well-formed [`LanguageIdentifier`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a [`ParseError`] if the input cannot be parsed.
///
/// # Examples
///
/// ```
/// use icu::locale::{langid, LanguageIdentifier};
///
/// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
/// .expect("Parsing failed.");
///
/// assert_eq!(li, langid!("en-US"));
/// ```
///
/// This method should be used for input that may be a locale identifier.
/// All extensions will be lost.
#[cfg(feature = "alloc")]
pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParseError> {
// Same parser as `try_from_utf8`, but run in `ParserMode::Locale` so that input carrying
// extensions is accepted.
parser::parse_language_identifier(v, parser::ParserMode::Locale)
}
/// Returns `true` if this [`LanguageIdentifier`] equals [`LanguageIdentifier::UNKNOWN`],
/// i.e. the language is unknown and there is no script, no region and no variant.
pub const fn is_unknown(&self) -> bool {
    if !self.language.is_unknown() {
        return false;
    }
    let has_script = self.script.is_some();
    let has_region = self.region.is_some();
    !has_script && !has_region && self.variants.is_empty()
}
/// Normalize the language identifier (operating on UTF-8 formatted byte slices)
///
/// This operation will normalize casing and the separator.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a [`ParseError`] if `input` cannot be parsed as a language identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// assert_eq!(
/// LanguageIdentifier::normalize_utf8(b"pL-latn-pl").as_deref(),
/// Ok("pl-Latn-PL")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize_utf8(input: &[u8]) -> Result<Cow<'_, str>, ParseError> {
// Parse first: malformed input is rejected here with a `ParseError`.
let lang_id = Self::try_from_utf8(input)?;
// NOTE(review): `to_string_or_borrow` presumably hands back `input` borrowed when it already
// equals the normalized form and allocates otherwise - confirm against the `writeable` docs.
Ok(writeable::to_string_or_borrow(&lang_id, input))
}
/// Normalize the language identifier (operating on strings)
///
/// This operation will normalize casing and the separator.
///
/// This is the `&str` counterpart of [`Self::normalize_utf8`], to which it forwards.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a [`ParseError`] if `input` cannot be parsed as a language identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// assert_eq!(
/// LanguageIdentifier::normalize("pL-latn-pl").as_deref(),
/// Ok("pl-Latn-PL")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize(input: &str) -> Result<Cow<'_, str>, ParseError> {
Self::normalize_utf8(input.as_bytes())
}
/// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// Sorting a list of langids with this method requires converting one of them to a string:
///
/// ```
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
/// use writeable::Writeable;
///
/// // Random input order:
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "zh-Hant-TW",
/// "zh-TW",
/// "und-fonipa",
/// "zh-Hant",
/// "ar-SA",
/// ];
///
/// let mut langids = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<LanguageIdentifier>>();
/// langids.sort_by(|a, b| {
/// let b = b.write_to_string();
/// a.strict_cmp(b.as_bytes())
/// });
/// let strict_cmp_strings = langids
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted alphabetically
/// let expected_ordering: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
///
/// assert_eq!(expected_ordering, strict_cmp_strings);
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
// Delegates to `writeable::cmp_utf8`, passing `self` (as a `Writeable`) and the raw bytes
// of `other`; this function itself never builds an intermediate string.
writeable::cmp_utf8(self, other)
}
pub(crate) fn as_tuple(
&self,
) -> (
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
&subtags::Variants,
) {
(self.language, self.script, self.region, &self.variants)
}
/// Compare this [`LanguageIdentifier`] with another [`LanguageIdentifier`] field-by-field.
/// The result is a total ordering sufficient for use in a [`BTreeSet`].
///
/// Unlike [`LanguageIdentifier::strict_cmp`], the ordering may or may not be equivalent
/// to string ordering, and it may or may not be stable across ICU4X releases.
///
/// # Examples
///
/// This method returns a nonsensical ordering derived from the fields of the struct:
///
/// ```
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
///
/// // Input strings, sorted alphabetically
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
/// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
///
/// let mut langids = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<LanguageIdentifier>>();
/// langids.sort_by(LanguageIdentifier::total_cmp);
/// let total_cmp_strings = langids
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted arbitrarily
/// let expected_ordering: &[&str] = &[
/// "ar-SA",
/// "ar-Latn",
/// "und-fonipa",
/// "zh-TW",
/// "zh-Hant",
/// "zh-Hant-TW",
/// ];
///
/// assert_eq!(expected_ordering, total_cmp_strings);
/// ```
///
/// Use a wrapper to add a [`LanguageIdentifier`] to a [`BTreeSet`]:
///
/// ```no_run
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
/// use std::collections::BTreeSet;
///
/// #[derive(PartialEq, Eq)]
/// struct LanguageIdentifierTotalOrd(LanguageIdentifier);
///
/// impl Ord for LanguageIdentifierTotalOrd {
/// fn cmp(&self, other: &Self) -> Ordering {
/// self.0.total_cmp(&other.0)
/// }
/// }
///
/// impl PartialOrd for LanguageIdentifierTotalOrd {
/// fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
/// Some(self.cmp(other))
/// }
/// }
///
/// let _: BTreeSet<LanguageIdentifierTotalOrd> = unimplemented!();
/// ```
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    // Compare the tuple views of both identifiers.
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    lhs.cmp(&rhs)
}
/// Checks this `LanguageIdentifier` against a potentially unnormalized BCP-47 string.
///
/// The result is the same as parsing the BCP-47 string into a `LanguageIdentifier` first
/// and then comparing the two structurally.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// let bcp47_strings: &[&str] = &[
///     "pl-LaTn-pL",
///     "uNd",
///     "UnD-adlm",
///     "uNd-GB",
///     "UND-FONIPA",
///     "ZH",
/// ];
///
/// for a in bcp47_strings {
///     assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
/// }
/// ```
pub fn normalizing_eq(&self, other: &str) -> bool {
    // Takes the next raw subtag from `$subtags`, parses it as `$T` and checks that the
    // result equals `$expected`. A missing or unparsable subtag counts as a mismatch.
    macro_rules! next_subtag_is {
        ($T:ty, $subtags:ident, $expected:expr) => {
            $subtags
                .next()
                .is_some_and(|raw| <$T>::try_from_utf8(raw) == Ok($expected))
        };
    }

    let mut raw_subtags = parser::SubtagIterator::new(other.as_bytes());

    // Walk our own subtags in serialization order and require the matching raw subtag for
    // each one that is present; `&&` stops at the first mismatch. Finally, `other` must
    // have nothing left over.
    next_subtag_is!(subtags::Language, raw_subtags, self.language)
        && self
            .script
            .iter()
            .all(|script| next_subtag_is!(subtags::Script, raw_subtags, *script))
        && self
            .region
            .iter()
            .all(|region| next_subtag_is!(subtags::Region, raw_subtags, *region))
        && self
            .variants
            .iter()
            .all(|variant| next_subtag_is!(subtags::Variant, raw_subtags, *variant))
        && raw_subtags.next().is_none()
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.as_str())?;
}
if let Some(ref region) = self.region {
f(region.as_str())?;
}
for variant in self.variants.iter() {
f(variant.as_str())?;
}
Ok(())
}
/// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
/// lowercase ascii form.
///
/// The default normalization of language identifiers uses titlecase scripts and uppercase
/// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// > ordered by the separators, alphabetically._
///
/// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
/// normalization of the language identifier.
///
/// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
/// but titlecased and uppercased outside T extensions respectively.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.to_tinystr().to_ascii_lowercase().as_str())?;
}
if let Some(ref region) = self.region {
f(region.to_tinystr().to_ascii_lowercase().as_str())?;
}
for variant in self.variants.iter() {
f(variant.as_str())?;
}
Ok(())
}
/// Writes this `LanguageIdentifier` to `sink` with `-` between subtags, using the lowercase
/// ascii form of the script and region subtags.
///
/// Language identifiers normally serialize with a titlecase script and an uppercase region.
/// [RFC6497 (BCP 47 Extension T)] asks for something different inside the extension:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// > ordered by the separators, alphabetically._
///
/// [`Transform extensions`] therefore use this method to obtain the correct normalization
/// of the language identifier they embed.
///
/// For example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**: script and region are lowercased inside the T extension,
/// but titlecased and uppercased respectively outside of it.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
    &self,
    sink: &mut W,
) -> core::fmt::Result {
    // Every subtag except the first is preceded by a `-`.
    let mut needs_separator = false;
    self.for_each_subtag_str_lowercased(&mut |subtag| {
        if needs_separator {
            sink.write_char('-')?;
        }
        needs_separator = true;
        sink.write_str(subtag)
    })
}
}
// `Debug` output is the same as `Display` output.
impl core::fmt::Debug for LanguageIdentifier {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        <Self as core::fmt::Display>::fmt(self, f)
    }
}
/// Lets a [`LanguageIdentifier`] be obtained through `str::parse`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for LanguageIdentifier {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Same as `Self::try_from_str(s)`, which forwards the bytes in exactly this way.
        Self::try_from_utf8(s.as_bytes())
    }
}
impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => Some(selff.language.as_str()));
// Round-trip check: each identifier below is expected to write back exactly the string it
// was parsed from (or, for `UNKNOWN`, the string "und").
#[test]
fn test_writeable() {
use writeable::assert_writeable_eq;
assert_writeable_eq!(LanguageIdentifier::UNKNOWN, "und");
// Language + region.
assert_writeable_eq!("und-001".parse::<LanguageIdentifier>().unwrap(), "und-001");
// Language + script.
assert_writeable_eq!(
"und-Mymr".parse::<LanguageIdentifier>().unwrap(),
"und-Mymr",
);
// Language + script + region.
assert_writeable_eq!(
"my-Mymr-MM".parse::<LanguageIdentifier>().unwrap(),
"my-Mymr-MM",
);
// Language + script + region + one variant.
assert_writeable_eq!(
"my-Mymr-MM-posix".parse::<LanguageIdentifier>().unwrap(),
"my-Mymr-MM-posix",
);
// Language + two variants.
assert_writeable_eq!(
"zh-macos-posix".parse::<LanguageIdentifier>().unwrap(),
"zh-macos-posix",
);
}
/// Builds a [`LanguageIdentifier`] from just a language; script, region and variants are
/// left empty.
///
/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::language, LanguageIdentifier};
///
/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
/// ```
impl From<subtags::Language> for LanguageIdentifier {
    fn from(language: subtags::Language) -> Self {
        // Go through the LSR-tuple conversion with the script and region left out.
        Self::from((
            language,
            None::<subtags::Script>,
            None::<subtags::Region>,
        ))
    }
}
/// Builds a [`LanguageIdentifier`] from an optional script; the language is
/// `Language::UNKNOWN`, and region and variants are left empty.
///
/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::script, LanguageIdentifier};
///
/// assert_eq!(
///     LanguageIdentifier::from(Some(script!("latn"))),
///     langid!("und-Latn")
/// );
/// ```
impl From<Option<subtags::Script>> for LanguageIdentifier {
    fn from(script: Option<subtags::Script>) -> Self {
        // Go through the LSR-tuple conversion with an unknown language and no region.
        Self::from((
            subtags::Language::UNKNOWN,
            script,
            None::<subtags::Region>,
        ))
    }
}
/// Builds a [`LanguageIdentifier`] from an optional region; the language is
/// `Language::UNKNOWN`, and script and variants are left empty.
///
/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::region, LanguageIdentifier};
///
/// assert_eq!(
///     LanguageIdentifier::from(Some(region!("US"))),
///     langid!("und-US")
/// );
/// ```
impl From<Option<subtags::Region>> for LanguageIdentifier {
    fn from(region: Option<subtags::Region>) -> Self {
        // Go through the LSR-tuple conversion with an unknown language and no script.
        Self::from((
            subtags::Language::UNKNOWN,
            None::<subtags::Script>,
            region,
        ))
    }
}
/// Builds a [`LanguageIdentifier`] from an LSR (language, script, region) tuple; the
/// variants are left empty.
///
/// # Examples
///
/// ```
/// use icu::locale::{
///     langid,
///     subtags::{language, region, script},
///     LanguageIdentifier,
/// };
///
/// let lang = language!("en");
/// let script = script!("Latn");
/// let region = region!("US");
/// assert_eq!(
///     LanguageIdentifier::from((lang, Some(script), Some(region))),
///     langid!("en-Latn-US")
/// );
/// ```
impl
    From<(
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
    )> for LanguageIdentifier
{
    fn from(
        lsr: (
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
        ),
    ) -> Self {
        let (language, script, region) = lsr;
        let variants = subtags::Variants::new();
        Self {
            language,
            script,
            region,
            variants,
        }
    }
}
/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// langid,
/// subtags::{language, region, script},
/// };
///
/// let lid = langid!("en-Latn-US");
/// let (lang, script, region) = (&lid).into();
///
/// assert_eq!(lang, language!("en"));
/// assert_eq!(script, Some(script!("Latn")));
/// assert_eq!(region, Some(region!("US")));
/// ```
impl From<&LanguageIdentifier>
for (
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
)
{
fn from(langid: &LanguageIdentifier) -> Self {
(langid.language, langid.script, langid.region)
}
}

96
vendor/icu_locale_core/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,96 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Parsing, manipulating, and serializing Unicode Language and Locale Identifiers.
//!
//! This module is published as its own crate ([`icu_locale_core`](https://docs.rs/icu_locale_core/latest/icu_locale_core/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! The module provides algorithms for parsing a string into a well-formed language or locale identifier
//! as defined by [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]. Additionally
//! the module provides [`preferences`] interface for operations on locale preferences and conversions
//! from and to locale unicode extensions.
//!
//! [`Locale`] is the most common structure to use for storing information about a language,
//! script, region, variants and extensions. In almost all cases, this struct should be used as the
//! base unit for all locale management operations.
//!
//! [`LanguageIdentifier`] is a strict subset of [`Locale`] which can be useful in a narrow range of
//! cases where [`Unicode Extensions`] are not relevant.
//!
//! If in doubt, use [`Locale`].
//!
//! # Examples
//!
//! ```
//! use icu::locale::Locale;
//! use icu::locale::{
//! locale,
//! subtags::{language, region},
//! };
//!
//! let mut loc: Locale = locale!("en-US");
//!
//! assert_eq!(loc.id.language, language!("en"));
//! assert_eq!(loc.id.script, None);
//! assert_eq!(loc.id.region, Some(region!("US")));
//! assert_eq!(loc.id.variants.len(), 0);
//!
//! loc.id.region = Some(region!("GB"));
//!
//! assert_eq!(loc, locale!("en-GB"));
//! ```
//!
//! For more details, see [`Locale`] and [`LanguageIdentifier`].
//!
//! [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]: https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers
//! [`ICU4X`]: ../icu/index.html
//! [`Unicode Extensions`]: extensions
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
extern crate alloc;
#[macro_use]
mod helpers;
mod data;
mod langid;
mod locale;
mod macros;
mod parser;
mod shortvec;
pub use data::DataLocale;
pub use langid::LanguageIdentifier;
pub use locale::Locale;
pub use parser::ParseError;
pub mod extensions;
#[macro_use]
pub mod subtags;
pub mod preferences;
pub mod zerovec;
#[cfg(all(feature = "alloc", feature = "serde"))]
mod serde;
#[cfg(feature = "databake")]
mod databake;

626
vendor/icu_locale_core/src/locale.rs vendored Normal file
View File

@@ -0,0 +1,626 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::parser::*;
use crate::subtags::Subtag;
use crate::{extensions, subtags, LanguageIdentifier};
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A core struct representing a [`Unicode Locale Identifier`].
///
/// A locale is made of two parts:
/// * Unicode Language Identifier
/// * A set of Unicode Extensions
///
/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
/// on top of that is able to parse, manipulate and serialize unicode extension fields.
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`Locale::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`Locale::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for a locale:
///
/// * *well-formed* - syntactically correct
/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
/// * *canonical* - valid and no deprecated codes or structure.
///
/// Any syntactically invalid subtags will cause the parsing to fail with an error.
///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacement is performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// ICU4X's Locale parsing does not allow for non-BCP-47-compatible locales [allowed by UTS 35 for backwards compatibility][tr35-bcp].
/// Furthermore, it currently does not allow for language tags to have more than three characters.
///
/// # Serde
///
/// This type implements `serde::Serialize` and `serde::Deserialize` if the
/// `"serde"` Cargo feature is enabled on the crate.
///
/// The value will be serialized as a string and parsed when deserialized.
/// For tips on efficient storage and retrieval of locales, see [`crate::zerovec`].
///
/// # Examples
///
/// Simple example:
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{key, value},
/// locale,
/// subtags::{language, region},
/// };
///
/// let loc = locale!("en-US-u-ca-buddhist");
///
/// assert_eq!(loc.id.language, language!("en"));
/// assert_eq!(loc.id.script, None);
/// assert_eq!(loc.id.region, Some(region!("US")));
/// assert_eq!(loc.id.variants.len(), 0);
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("ca")),
/// Some(&value!("buddhist"))
/// );
/// ```
///
/// More complex example:
///
/// ```
/// use icu::locale::{subtags::*, Locale};
///
/// let loc: Locale = "eN-latn-Us-Valencia-u-hC-H12"
/// .parse()
/// .expect("Failed to parse.");
///
/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
/// assert_eq!(
/// loc.id.variants.first(),
/// "valencia".parse::<Variant>().ok().as_ref()
/// );
/// ```
///
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
/// [tr35-bcp]: https://unicode.org/reports/tr35/#BCP_47_Conformance
#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct Locale {
/// The basic language/script/region components in the locale identifier along with any variants.
pub id: LanguageIdentifier,
/// Any extensions present in the locale identifier.
pub extensions: extensions::Extensions,
}
#[test]
// Expected sizes are based on a 64-bit architecture
#[cfg(target_pointer_width = "64")]
fn test_sizes() {
    // Pins the byte size of each listed type so that an accidental layout change
    // shows up as a test failure.
    macro_rules! assert_size {
        ($T:ty, $bytes:expr) => {
            assert_eq!(core::mem::size_of::<$T>(), $bytes)
        };
    }

    // Subtags and the language identifier.
    assert_size!(subtags::Language, 3);
    assert_size!(subtags::Script, 4);
    assert_size!(subtags::Region, 3);
    assert_size!(subtags::Variant, 8);
    assert_size!(subtags::Variants, 16);
    assert_size!(LanguageIdentifier, 32);

    // Extensions.
    assert_size!(extensions::transform::Transform, 56);
    assert_size!(Option<LanguageIdentifier>, 32);
    assert_size!(extensions::transform::Fields, 24);
    assert_size!(extensions::unicode::Attributes, 16);
    assert_size!(extensions::unicode::Keywords, 24);
    assert_size!(Vec<extensions::other::Other>, 24);
    assert_size!(extensions::private::Private, 16);
    assert_size!(extensions::Extensions, 136);

    // The full locale.
    assert_size!(Locale, 168);
}
impl Locale {
/// The unknown locale "und".
///
/// Built at compile time with the `locale!` macro.
pub const UNKNOWN: Self = crate::locale!("und");
/// A constructor which takes a utf8 slice, parses it and
/// produces a well-formed [`Locale`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// Locale::try_from_str("en-US-u-hc-h12").unwrap();
/// ```
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    // Parsing is byte-oriented, so the string is handed over as its UTF-8 bytes.
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
// All of the work happens in the parser's `parse_locale`; its error is returned unchanged.
parse_locale(code_units)
}
/// Normalize the locale (operating on UTF-8 formatted byte slices)
///
/// This operation will normalize casing and the separator.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// assert_eq!(
/// Locale::normalize_utf8(b"pL-latn-pl-U-HC-H12").as_deref(),
/// Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize_utf8(input: &[u8]) -> Result<Cow<'_, str>, ParseError> {
    // Parse first so that malformed input is rejected, then serialize the parsed locale.
    // The original bytes are passed along as the reference for `to_string_or_borrow`
    // (presumably so an already-normalized input can be returned as a borrow).
    Self::try_from_utf8(input).map(|parsed| writeable::to_string_or_borrow(&parsed, input))
}
/// Normalize the locale (operating on strings)
///
/// This operation will normalize casing and the separator.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// assert_eq!(
/// Locale::normalize("pL-latn-pl-U-HC-H12").as_deref(),
/// Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize(input: &str) -> Result<Cow<'_, str>, ParseError> {
    // Same operation as `normalize_utf8`, applied to the string's UTF-8 bytes.
    let bytes = input.as_bytes();
    Self::normalize_utf8(bytes)
}
/// Compare this [`Locale`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`Locale`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// Sorting a list of locales with this method requires converting one of them to a string:
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
/// use writeable::Writeable;
///
/// // Random input order:
/// let bcp47_strings: &[&str] = &[
/// "und-u-ca-hebrew",
/// "ar-Latn",
/// "zh-Hant-TW",
/// "zh-TW",
/// "und-fonipa",
/// "zh-Hant",
/// "ar-SA",
/// ];
///
/// let mut locales = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<Locale>>();
/// locales.sort_by(|a, b| {
/// let b = b.write_to_string();
/// a.strict_cmp(b.as_bytes())
/// });
/// let strict_cmp_strings = locales
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted alphabetically
/// let expected_ordering: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "und-u-ca-hebrew",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
///
/// assert_eq!(expected_ordering, strict_cmp_strings);
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
// Delegates to `writeable::cmp_utf8`, which compares this locale against the bytes in `other`.
writeable::cmp_utf8(self, other)
}
#[expect(clippy::type_complexity)]
pub(crate) fn as_tuple(
    &self,
) -> (
    (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        &subtags::Variants,
    ),
    (
        (
            &extensions::unicode::Attributes,
            &extensions::unicode::Keywords,
        ),
        (
            Option<(
                subtags::Language,
                Option<subtags::Script>,
                Option<subtags::Region>,
                &subtags::Variants,
            )>,
            &extensions::transform::Fields,
        ),
        &extensions::private::Private,
        &[extensions::other::Other],
    ),
) {
    // Flattens the locale into nested tuples: the language identifier's parts first,
    // then the extensions' parts. `total_cmp` compares these tuples.
    let id_parts = self.id.as_tuple();
    let extension_parts = self.extensions.as_tuple();
    (id_parts, extension_parts)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// Unlike [`Locale::strict_cmp`], the ordering may or may not be equivalent
/// to string ordering, and it may or may not be stable across ICU4X releases.
///
/// # Examples
///
/// This method returns a nonsensical ordering derived from the fields of the struct:
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
///
/// // Input strings, sorted alphabetically
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "und-u-ca-hebrew",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
/// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
///
/// let mut locales = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<Locale>>();
/// locales.sort_by(Locale::total_cmp);
/// let total_cmp_strings = locales
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted arbitrarily
/// let expected_ordering: &[&str] = &[
/// "ar-SA",
/// "ar-Latn",
/// "und-u-ca-hebrew",
/// "und-fonipa",
/// "zh-TW",
/// "zh-Hant",
/// "zh-Hant-TW",
/// ];
///
/// assert_eq!(expected_ordering, total_cmp_strings);
/// ```
///
/// Use a wrapper to add a [`Locale`] to a [`BTreeSet`]:
///
/// ```no_run
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
/// use std::collections::BTreeSet;
///
/// #[derive(PartialEq, Eq)]
/// struct LocaleTotalOrd(Locale);
///
/// impl Ord for LocaleTotalOrd {
/// fn cmp(&self, other: &Self) -> Ordering {
/// self.0.total_cmp(&other.0)
/// }
/// }
///
/// impl PartialOrd for LocaleTotalOrd {
/// fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
/// Some(self.cmp(other))
/// }
/// }
///
/// let _: BTreeSet<LocaleTotalOrd> = unimplemented!();
/// ```
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    // Field-wise comparison of the tuple views of both locales.
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    lhs.cmp(&rhs)
}
/// Compare this `Locale` with a potentially unnormalized BCP-47 string.
///
/// The return value is equivalent to what would happen if you first parsed the
/// BCP-47 string to a `Locale` and then performed a structural comparison.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let bcp47_strings: &[&str] = &[
/// "pl-LaTn-pL",
/// "uNd",
/// "UND-FONIPA",
/// "UnD-t-m0-TrUe",
/// "uNd-u-CA-Japanese",
/// "ZH",
/// ];
///
/// for a in bcp47_strings {
/// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
/// }
/// ```
#[cfg(feature = "alloc")]
pub fn normalizing_eq(&self, other: &str) -> bool {
    // Consumes the next subtag from `$iter` and reports whether it parses as `$T`
    // and equals `$expected`. A missing or unparsable subtag is a mismatch.
    macro_rules! subtag_matches {
        ($T:ty, $iter:ident, $expected:expr) => {
            matches!(
                $iter.next(),
                Some(bytes) if <$T>::try_from_utf8(bytes) == Ok($expected)
            )
        };
    }

    let mut iter = SubtagIterator::new(other.as_bytes());

    // The language subtag is always present.
    if !subtag_matches!(subtags::Language, iter, self.id.language) {
        return false;
    }

    // Script and region are only expected in `other` when `self` has them.
    if let Some(script) = &self.id.script {
        if !subtag_matches!(subtags::Script, iter, *script) {
            return false;
        }
    }
    if let Some(region) = &self.id.region {
        if !subtag_matches!(subtags::Region, iter, *region) {
            return false;
        }
    }

    // Variants must appear in the same order as stored in `self`.
    let variants_match = self
        .id
        .variants
        .iter()
        .all(|variant| subtag_matches!(subtags::Variant, iter, *variant));
    if !variants_match {
        return false;
    }

    // Extensions are parsed from the remainder and compared structurally.
    if !self.extensions.is_empty() {
        let Ok(parsed) = extensions::Extensions::try_from_iter(&mut iter) else {
            return false;
        };
        if self.extensions != parsed {
            return false;
        }
    }

    // Any subtag left over means `other` contains more than `self`.
    iter.next().is_none()
}
#[doc(hidden)] // macro use
#[expect(clippy::type_complexity)]
pub const fn try_from_utf8_with_single_variant_single_keyword_unicode_extension(
code_units: &[u8],
) -> Result<
(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
Option<subtags::Variant>,
Option<(extensions::unicode::Key, Option<Subtag>)>,
),
ParseError,
> {
// `const` entry point used by the `locale!` macro: runs the restricted parser in
// `ParserMode::Locale` and returns the parsed parts as a flat tuple
// (language, script, region, variant, unicode keyword).
parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
code_units,
ParserMode::Locale,
)
}
// Calls `f` for the subtags of the language identifier first and of the extensions second.
// An error from the identifier pass is returned immediately, before the extensions are visited.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
self.id.for_each_subtag_str(f)?;
self.extensions.for_each_subtag_str(f)?;
Ok(())
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Locale {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Makes `"en-US".parse::<Locale>()` available; identical to `Locale::try_from_str`.
Self::try_from_str(s)
}
}
impl From<LanguageIdentifier> for Locale {
    /// Wraps a language identifier in a locale with default-constructed extensions.
    fn from(id: LanguageIdentifier) -> Self {
        let extensions = extensions::Extensions::default();
        Self { id, extensions }
    }
}
impl From<Locale> for LanguageIdentifier {
fn from(loc: Locale) -> Self {
loc.id
}
}
impl core::fmt::Debug for Locale {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
// `Debug` output is the locale's `Writeable` rendering, written directly into the formatter.
writeable::Writeable::write_to(self, f)
}
}
// NOTE(review): the macro is defined elsewhere. Judging by its arguments it builds the
// `Writeable` implementation for `Locale` on top of `for_each_subtag_str`, and uses the
// identifier's `writeable_borrow()` when there are no extensions; confirm against the macro.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.writeable_borrow());
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // The constant has no string to parse from, so it is checked on its own.
    assert_writeable_eq!(Locale::UNKNOWN, "und");

    // Every tag below is already normalized, so writing the parsed locale must
    // reproduce the input exactly.
    for tag in [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ] {
        assert_writeable_eq!(tag.parse::<Locale>().unwrap(), tag);
    }
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{locale, subtags::language};
///
/// assert_eq!(Locale::from(language!("en")), locale!("en"));
/// ```
impl From<subtags::Language> for Locale {
fn from(language: subtags::Language) -> Self {
Self {
id: language.into(),
extensions: extensions::Extensions::new(),
}
}
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{locale, subtags::script};
///
/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
/// ```
impl From<Option<subtags::Script>> for Locale {
fn from(script: Option<subtags::Script>) -> Self {
Self {
id: script.into(),
extensions: extensions::Extensions::new(),
}
}
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{locale, subtags::region};
///
/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
/// ```
impl From<Option<subtags::Region>> for Locale {
fn from(region: Option<subtags::Region>) -> Self {
Self {
id: region.into(),
extensions: extensions::Extensions::new(),
}
}
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{
/// locale,
/// subtags::{language, region, script},
/// };
///
/// assert_eq!(
/// Locale::from((
/// language!("en"),
/// Some(script!("Latn")),
/// Some(region!("US"))
/// )),
/// locale!("en-Latn-US")
/// );
/// ```
impl
From<(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
)> for Locale
{
fn from(
lsr: (
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
),
) -> Self {
Self {
id: lsr.into(),
extensions: extensions::Extensions::new(),
}
}
}

185
vendor/icu_locale_core/src/macros.rs vendored Normal file
View File

@@ -0,0 +1,185 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// A macro allowing for compile-time construction of valid [`LanguageIdentifier`]s.
///
/// The macro will perform syntax normalization of the tag.
///
/// # Examples
///
/// ```
/// use icu::locale::{langid, LanguageIdentifier};
///
/// const DE_AT: LanguageIdentifier = langid!("de-at");
///
/// let de_at: LanguageIdentifier = "de-at".parse().unwrap();
///
/// assert_eq!(DE_AT, de_at);
/// ```
///
/// *Note*: The macro cannot produce language identifiers with more than one variant due to const
/// limitations (see [`Heap Allocations in Constants`]):
///
/// ```compile_fail,E0080
/// icu::locale::langid!("und-variant1-variant2");
/// ```
///
/// Use runtime parsing instead:
/// ```
/// "und-variant1-variant2"
/// .parse::<icu::locale::LanguageIdentifier>()
/// .unwrap();
/// ```
///
/// [`LanguageIdentifier`]: crate::LanguageIdentifier
/// [`Heap Allocations in Constants`]: https://github.com/rust-lang/const-eval/issues/20
#[macro_export]
macro_rules! langid {
// Takes one string literal. The whole expansion sits inside a `const` block, so an
// invalid literal turns into a compile-time panic instead of a runtime error.
($langid:literal) => { const {
match $crate::LanguageIdentifier::try_from_utf8_with_single_variant($langid.as_bytes()) {
Ok((language, script, region, variant)) => $crate::LanguageIdentifier {
language,
script,
region,
// At most one variant can be represented here (see the macro documentation).
variants: match variant {
Some(v) => $crate::subtags::Variants::from_variant(v),
None => $crate::subtags::Variants::new(),
}
},
// Any parse error, including "more than one variant", ends up here.
_ => panic!(concat!("Invalid language code: ", $langid, " . Note langid! macro can only support up to a single variant tag. Use runtime parsing instead.")),
}
}};
}
/// A macro allowing for compile-time construction of valid [`Locale`]s.
///
/// The macro will perform syntax normalization of the tag.
///
/// # Examples
///
/// ```
/// use icu::locale::{locale, Locale};
///
/// const DE_AT: Locale = locale!("de-at");
///
/// let de_at: Locale = "de-at".parse().unwrap();
///
/// assert_eq!(DE_AT, de_at);
/// ```
///
/// *Note*: The macro cannot produce locales with more than one variant or multiple extensions
/// (only single keyword unicode extension is supported) due to const
/// limitations (see [`Heap Allocations in Constants`]):
///
/// ```compile_fail,E0080
/// icu::locale::locale!("sl-IT-rozaj-biske-1994");
/// ```
/// Use runtime parsing instead:
/// ```
/// "sl-IT-rozaj-biske-1994"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
///
/// Locales with multiple keys are not supported
/// ```compile_fail,E0080
/// icu::locale::locale!("th-TH-u-ca-buddhist-nu-thai");
/// ```
/// Use runtime parsing instead:
/// ```
/// "th-TH-u-ca-buddhist-nu-thai"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
///
/// Locales with attributes are not supported
/// ```compile_fail,E0080
/// icu::locale::locale!("en-US-u-foobar-ca-buddhist");
/// ```
/// Use runtime parsing instead:
/// ```
/// "en-US-u-foobar-ca-buddhist"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
///
/// Locales with single key but multiple types are not supported
/// ```compile_fail,E0080
/// icu::locale::locale!("en-US-u-ca-islamic-umalqura");
/// ```
/// Use runtime parsing instead:
/// ```
/// "en-US-u-ca-islamic-umalqura"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
/// [`Locale`]: crate::Locale
/// [`Heap Allocations in Constants`]: https://github.com/rust-lang/const-eval/issues/20
#[macro_export]
macro_rules! locale {
// Takes one string literal. The whole expansion sits inside a `const` block, so an
// invalid or unsupported literal turns into a compile-time panic.
($locale:literal) => { const {
match $crate::Locale::try_from_utf8_with_single_variant_single_keyword_unicode_extension(
$locale.as_bytes(),
) {
Ok((language, script, region, variant, keyword)) => $crate::Locale {
id: $crate::LanguageIdentifier {
language,
script,
region,
// At most one variant can be represented here (see the macro documentation).
variants: match variant {
Some(v) => $crate::subtags::Variants::from_variant(v),
None => $crate::subtags::Variants::new(),
},
},
// `keyword` is a `(key, optional value subtag)` pair; it becomes a unicode
// extension with exactly that one keyword and no attributes.
extensions: match keyword {
Some(k) => $crate::extensions::Extensions::from_unicode(
$crate::extensions::unicode::Unicode {
keywords: $crate::extensions::unicode::Keywords::new_single(
k.0,
$crate::extensions::unicode::Value::from_subtag(k.1),
),
attributes: $crate::extensions::unicode::Attributes::new(),
},
),
None => $crate::extensions::Extensions::new(),
},
},
// Any parse error, including input the const parser cannot represent, ends up here.
_ => panic!(concat!(
"Invalid language code: ",
$locale,
" . Note the locale! macro only supports up to one variant tag; \
and one unicode keyword, other extension are \
not supported. Use runtime parsing instead."
)),
}
}};
}
#[cfg(test)]
mod test {
    use crate::LanguageIdentifier;
    use crate::Locale;

    // Each test builds a value with the macro in a `const` item and checks that it equals
    // the result of parsing the same tag at runtime.

    #[test]
    fn test_langid_macro_can_parse_langid_with_single_variant() {
        const FROM_MACRO: LanguageIdentifier = langid!("de-at-foobar");
        let parsed = "de-at-foobar".parse::<LanguageIdentifier>().unwrap();
        assert_eq!(FROM_MACRO, parsed);
    }

    #[test]
    fn test_locale_macro_can_parse_locale_with_single_variant() {
        const FROM_MACRO: Locale = locale!("de-at-foobar");
        let parsed = "de-at-foobar".parse::<Locale>().unwrap();
        assert_eq!(FROM_MACRO, parsed);
    }

    #[test]
    fn test_locale_macro_can_parse_locale_with_single_keyword_unicode_extension() {
        const FROM_MACRO: Locale = locale!("de-at-u-ca-foobar");
        let parsed = "de-at-u-ca-foobar".parse::<Locale>().unwrap();
        assert_eq!(FROM_MACRO, parsed);
    }
}

View File

@@ -0,0 +1,69 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use displaydoc::Display;
/// List of parser errors that can be generated
/// while parsing [`LanguageIdentifier`](crate::LanguageIdentifier), [`Locale`](crate::Locale),
/// [`subtags`](crate::subtags) or [`extensions`](crate::extensions).
// `Display` is derived by `displaydoc`: each variant's message is the string given in its
// `#[displaydoc(...)]` attribute.
#[derive(Display, Debug, PartialEq, Copy, Clone)]
// Downstream `match`es need a wildcard arm, which leaves room for new variants.
#[non_exhaustive]
pub enum ParseError {
/// Invalid language subtag.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Language;
/// use icu::locale::ParseError;
///
/// assert_eq!("x2".parse::<Language>(), Err(ParseError::InvalidLanguage));
/// ```
#[displaydoc("The given language subtag is invalid")]
InvalidLanguage,
/// Invalid script, region or variant subtag.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Region;
/// use icu::locale::ParseError;
///
/// assert_eq!("#@2X".parse::<Region>(), Err(ParseError::InvalidSubtag));
/// ```
#[displaydoc("Invalid subtag")]
InvalidSubtag,
/// Invalid extension subtag.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Key;
/// use icu::locale::ParseError;
///
/// assert_eq!("#@2X".parse::<Key>(), Err(ParseError::InvalidExtension));
/// ```
#[displaydoc("Invalid extension")]
InvalidExtension,
/// Duplicated extension.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::ParseError;
///
/// assert_eq!(
/// "und-u-hc-h12-u-ca-calendar".parse::<Locale>(),
/// Err(ParseError::DuplicatedExtension)
/// );
/// ```
#[displaydoc("Duplicated extension")]
DuplicatedExtension,
}
// Empty impl: the trait's default methods are used as-is; `Debug` and `Display` come from
// the derives on the enum.
impl core::error::Error for ParseError {}

View File

@@ -0,0 +1,273 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
pub use super::errors::ParseError;
use crate::extensions::unicode::{Attribute, Key, Value};
use crate::extensions::ExtensionType;
use crate::parser::SubtagIterator;
#[cfg(feature = "alloc")]
use crate::shortvec::ShortBoxSlice;
use crate::subtags::Subtag;
#[cfg(feature = "alloc")]
use crate::LanguageIdentifier;
use crate::{extensions, subtags};
// Selects how the language-identifier parsers in this file treat the subtag stream.
#[derive(PartialEq, Clone, Copy)]
pub enum ParserMode {
// Every subtag must be part of the language identifier; single-character subtags do not
// end parsing.
LanguageIdentifier,
// The language identifier ends at the first single-character subtag (the start of an
// extension); the `const` parser then goes on to read a unicode extension.
Locale,
// Like `Locale` for single-character subtags, and additionally a subtag that is not a
// script, region or variant ends parsing instead of producing an error.
#[allow(dead_code)]
Partial,
}
// Tracks which component the parser may accept next, after the language subtag.
#[derive(PartialEq, Clone, Copy)]
enum ParserPosition {
// Next subtag may be a script, a region or a variant.
Script,
// Script already seen: next subtag may be a region or a variant.
Region,
// Only variants may follow.
Variant,
}
/// Parses a language identifier from the front of `iter`, leaving the iterator positioned
/// at the first subtag that was not consumed.
///
/// The language subtag is required; script, region and variants are optional and must come
/// in that order. Variants are kept sorted, and a repeated variant is an error.
///
/// How parsing ends depends on `mode`: outside `ParserMode::LanguageIdentifier` a
/// single-character subtag ends the identifier, and in `ParserMode::Partial` an unrecognized
/// subtag ends parsing instead of failing.
///
/// # Errors
///
/// Returns `ParseError::InvalidLanguage` when there is no subtag at all, the language
/// parser's error when the first subtag is not a language, and `ParseError::InvalidSubtag`
/// for an unrecognized or duplicated subtag.
#[cfg(feature = "alloc")]
pub fn parse_language_identifier_from_iter(
    iter: &mut SubtagIterator,
    mode: ParserMode,
) -> Result<LanguageIdentifier, ParseError> {
    // The language subtag is mandatory and always comes first.
    let Some(first) = iter.next() else {
        return Err(ParseError::InvalidLanguage);
    };
    let language = subtags::Language::try_from_utf8(first)?;

    let mut script = None;
    let mut region = None;
    let mut variants = ShortBoxSlice::new();
    let mut position = ParserPosition::Script;

    while let Some(subtag) = iter.peek() {
        // A single-character subtag starts an extension, which is not ours to parse.
        if subtag.len() == 1 && mode != ParserMode::LanguageIdentifier {
            break;
        }

        // Try the components still allowed at this position, most specific first.
        let recognized = match position {
            ParserPosition::Script => {
                if let Ok(s) = subtags::Script::try_from_utf8(subtag) {
                    script = Some(s);
                    position = ParserPosition::Region;
                    true
                } else if let Ok(r) = subtags::Region::try_from_utf8(subtag) {
                    region = Some(r);
                    position = ParserPosition::Variant;
                    true
                } else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
                    if let Err(idx) = variants.binary_search(&v) {
                        variants.insert(idx, v);
                    }
                    position = ParserPosition::Variant;
                    true
                } else {
                    false
                }
            }
            ParserPosition::Region => {
                if let Ok(r) = subtags::Region::try_from_utf8(subtag) {
                    region = Some(r);
                    position = ParserPosition::Variant;
                    true
                } else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
                    if let Err(idx) = variants.binary_search(&v) {
                        variants.insert(idx, v);
                    }
                    position = ParserPosition::Variant;
                    true
                } else {
                    false
                }
            }
            ParserPosition::Variant => {
                if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
                    match variants.binary_search(&v) {
                        // Insert at the sorted position.
                        Err(idx) => variants.insert(idx, v),
                        // The same variant twice is not allowed.
                        Ok(_) => return Err(ParseError::InvalidSubtag),
                    };
                    true
                } else {
                    false
                }
            }
        };

        if !recognized {
            if mode == ParserMode::Partial {
                break;
            }
            return Err(ParseError::InvalidSubtag);
        }

        // The subtag was only peeked so far; consume it now that it has been accepted.
        iter.next();
    }

    Ok(LanguageIdentifier {
        language,
        script,
        region,
        variants: subtags::Variants::from_short_slice_unchecked(variants),
    })
}
/// Parses a language identifier from raw bytes by running
/// `parse_language_identifier_from_iter` over a fresh subtag iterator.
#[cfg(feature = "alloc")]
pub fn parse_language_identifier(
    t: &[u8],
    mode: ParserMode,
) -> Result<LanguageIdentifier, ParseError> {
    parse_language_identifier_from_iter(&mut SubtagIterator::new(t), mode)
}
/// Parses a locale from `iter` in a `const` context.
///
/// No allocation is possible here, so the result is a flat tuple
/// `(language, script, region, variant, keyword)` holding at most one variant and at most one
/// unicode-extension keyword, whose value is at most one subtag.
///
/// `mode` controls how far parsing goes:
/// - `ParserMode::LanguageIdentifier`: every subtag must belong to the language identifier.
/// - `ParserMode::Locale`: the identifier ends at the first single-character subtag, after
///   which a single `-u-` extension may follow.
/// - `ParserMode::Partial`: the identifier ends at the first single-character or unrecognized
///   subtag; nothing after it is parsed.
///
/// # Errors
///
/// - `ParseError::InvalidLanguage` if there is no subtag at all; an error from parsing the
///   language subtag is returned unchanged.
/// - `ParseError::InvalidSubtag` for an unrecognized identifier subtag, and for input that
///   cannot be represented here: a second variant, a unicode attribute, a second key, a second
///   value subtag, or any extension other than `-u-`.
/// - `ParseError::InvalidExtension` if `-u` is not followed by a key, or is followed by a
///   subtag that is neither a key nor the value of a key.
/// - Errors from the extension-type, key and value parsers are returned unchanged.
#[expect(clippy::type_complexity)]
pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(
    mut iter: SubtagIterator,
    mode: ParserMode,
) -> Result<
    (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        Option<subtags::Variant>,
        Option<(extensions::unicode::Key, Option<Subtag>)>,
    ),
    ParseError,
> {
    let language;
    let mut script = None;
    let mut region = None;
    let mut variant = None;
    let mut keyword = None;

    // The language subtag is mandatory and always comes first.
    if let (i, Some(subtag)) = iter.next_const() {
        iter = i;
        match subtags::Language::try_from_utf8(subtag) {
            Ok(l) => language = l,
            Err(e) => return Err(e),
        }
    } else {
        return Err(ParseError::InvalidLanguage);
    }

    let mut position = ParserPosition::Script;

    while let Some(subtag) = iter.peek() {
        // A single-character subtag starts an extension and ends the identifier.
        if !matches!(mode, ParserMode::LanguageIdentifier) && subtag.len() == 1 {
            break;
        }

        if matches!(position, ParserPosition::Script) {
            if let Ok(s) = subtags::Script::try_from_utf8(subtag) {
                script = Some(s);
                position = ParserPosition::Region;
            } else if let Ok(r) = subtags::Region::try_from_utf8(subtag) {
                region = Some(r);
                position = ParserPosition::Variant;
            } else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
                // We cannot handle multiple variants in a const context
                debug_assert!(variant.is_none());
                variant = Some(v);
                position = ParserPosition::Variant;
            } else if matches!(mode, ParserMode::Partial) {
                break;
            } else {
                return Err(ParseError::InvalidSubtag);
            }
        } else if matches!(position, ParserPosition::Region) {
            if let Ok(s) = subtags::Region::try_from_utf8(subtag) {
                region = Some(s);
                position = ParserPosition::Variant;
            } else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
                // We cannot handle multiple variants in a const context
                debug_assert!(variant.is_none());
                variant = Some(v);
                position = ParserPosition::Variant;
            } else if matches!(mode, ParserMode::Partial) {
                break;
            } else {
                return Err(ParseError::InvalidSubtag);
            }
        } else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
            debug_assert!(matches!(position, ParserPosition::Variant));
            if variant.is_some() {
                // We cannot handle multiple variants in a const context
                return Err(ParseError::InvalidSubtag);
            }
            variant = Some(v);
        } else if matches!(mode, ParserMode::Partial) {
            break;
        } else {
            return Err(ParseError::InvalidSubtag);
        }

        // The subtag was only peeked so far; consume it now that it has been accepted.
        iter = iter.next_const().0;
    }

    if matches!(mode, ParserMode::Locale) {
        if let Some(subtag) = iter.peek() {
            match ExtensionType::try_from_utf8(subtag) {
                Ok(ExtensionType::Unicode) => {
                    // Consume the `u` singleton.
                    iter = iter.next_const().0;
                    if let Some(peek) = iter.peek() {
                        if Attribute::try_from_utf8(peek).is_ok() {
                            // We cannot handle Attributes in a const context
                            return Err(ParseError::InvalidSubtag);
                        }
                    }

                    let mut key = None;
                    let mut current_type = None;

                    while let Some(peek) = iter.peek() {
                        if peek.len() == 2 {
                            if key.is_some() {
                                // We cannot handle more than one Key in a const context
                                return Err(ParseError::InvalidSubtag);
                            }
                            match Key::try_from_utf8(peek) {
                                Ok(k) => key = Some(k),
                                Err(e) => return Err(e),
                            };
                        } else if key.is_some() {
                            match Value::parse_subtag_from_utf8(peek) {
                                Ok(Some(t)) => {
                                    if current_type.is_some() {
                                        // We cannot handle more than one type in a const context
                                        return Err(ParseError::InvalidSubtag);
                                    }
                                    current_type = Some(t);
                                }
                                Ok(None) => {}
                                Err(e) => return Err(e),
                            }
                        } else {
                            // A subtag directly after `-u` that is neither a key nor an
                            // attribute (for example `en-u-x-foo`). Stopping here and
                            // returning `Ok` would silently drop the rest of the input, so
                            // the malformed extension is rejected instead.
                            return Err(ParseError::InvalidExtension);
                        }
                        iter = iter.next_const().0;
                    }

                    match key {
                        Some(k) => keyword = Some((k, current_type)),
                        // `-u` with nothing after it (for example `en-u`) is not a
                        // well-formed unicode extension.
                        None => return Err(ParseError::InvalidExtension),
                    }
                }
                // We cannot handle Transform, Private, Other extensions in a const context
                Ok(_) => return Err(ParseError::InvalidSubtag),
                Err(e) => return Err(e),
            }
        }
    }

    Ok((language, script, region, variant, keyword))
}
/// `const` parser for a language identifier with at most one variant.
///
/// Runs the shared `const` locale parser over `t` and drops the keyword slot of its result.
#[expect(clippy::type_complexity)]
pub const fn parse_language_identifier_with_single_variant(
    t: &[u8],
    mode: ParserMode,
) -> Result<
    (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        Option<subtags::Variant>,
    ),
    ParseError,
> {
    let parsed = parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(
        SubtagIterator::new(t),
        mode,
    );
    // `Result::map` is not usable in a `const fn`, hence the explicit `match`.
    match parsed {
        Err(error) => Err(error),
        Ok((language, script, region, variant, _)) => Ok((language, script, region, variant)),
    }
}

View File

@@ -0,0 +1,42 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions;
use crate::parser::{ParseError, ParserMode, SubtagIterator};
use crate::subtags::{self, Subtag};
#[cfg(feature = "alloc")]
use crate::Locale;
use super::parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter;
/// Parses a full locale: the language identifier first, then, if any subtags remain,
/// the extensions.
#[cfg(feature = "alloc")]
pub fn parse_locale(t: &[u8]) -> Result<Locale, ParseError> {
    let mut subtags = SubtagIterator::new(t);
    let id = super::parse_language_identifier_from_iter(&mut subtags, ParserMode::Locale)?;
    // Only invoke the extension parser when something is left after the identifier.
    let extensions = match subtags.peek() {
        Some(_) => extensions::Extensions::try_from_iter(&mut subtags)?,
        None => extensions::Extensions::default(),
    };
    Ok(Locale { id, extensions })
}
// `const` entry point over raw bytes: wraps `t` in a `SubtagIterator` and hands it to the
// shared restricted parser, which supports at most one variant and one unicode keyword.
#[expect(clippy::type_complexity)]
pub const fn parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
t: &[u8],
mode: ParserMode,
) -> Result<
(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
Option<subtags::Variant>,
Option<(extensions::unicode::Key, Option<Subtag>)>,
),
ParseError,
> {
let iter = SubtagIterator::new(t);
parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(iter, mode)
}

185
vendor/icu_locale_core/src/parser/mod.rs vendored Normal file
View File

@@ -0,0 +1,185 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
pub mod errors;
mod langid;
mod locale;
pub use errors::ParseError;
pub use langid::*;
pub use locale::*;
// Returns the bytes of `slice` that come before the first `-`, or all of `slice` when it
// contains no `-`.
//
// Safety-usable invariant: returns a prefix of `slice`
const fn skip_before_separator(slice: &[u8]) -> &[u8] {
    // Number of leading non-separator bytes found so far.
    // Invariant: prefix_len ≤ slice.len(). It starts at 0 and is only incremented after the
    // check below has established prefix_len < slice.len().
    let mut prefix_len = 0;
    #[expect(clippy::indexing_slicing)] // the index is checked against the length right before use
    loop {
        if prefix_len >= slice.len() || slice[prefix_len] == b'-' {
            break;
        }
        prefix_len += 1;
    }
    // Notice: the result may be empty for cases like `"en-"` or `"en--US"`
    // SAFETY: prefix_len ≤ slice.len() by the loop invariant
    // Safety-usable invariant upheld: the first half of a split is a prefix of the slice
    unsafe { slice.split_at_unchecked(prefix_len).0 }
}
// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// It is quite extraordinary due to focus on performance and Rust limitations for `const`
// functions.
//
// The iterator is eager: the next subtag is located as soon as the previous one is consumed.
// It does not validate its input. For slices such as `"-"`, `"-en"` or `"en-"` it yields
// empty subtags, which the subtag parsers are expected to reject.
//
// `const` callers use `next_const` (which takes and returns the iterator by value) and
// `peek`; other callers can use the regular `Iterator::next`.
//
// All of these return an `Option` holding the subtag's bytes.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
// Input that has not been consumed yet; begins with the bytes of `current`, if any.
remaining: &'a [u8],
// Safety invariant: current is a prefix of remaining
current: Option<&'a [u8]>,
}
impl<'a> SubtagIterator<'a> {
    /// Creates an iterator over the `-`-separated subtags of `rest`.
    pub const fn new(rest: &'a [u8]) -> Self {
        // Safety invariant upheld: skip_before_separator() returns a prefix of `rest`
        let first = skip_before_separator(rest);
        Self {
            remaining: rest,
            current: Some(first),
        }
    }

    /// `const` counterpart of `Iterator::next`: returns the advanced iterator together with
    /// the subtag that was current, or `None` once the input is exhausted.
    pub const fn next_const(mut self) -> (Self, Option<&'a [u8]>) {
        let subtag = match self.current {
            Some(subtag) => subtag,
            None => return (self, None),
        };
        if subtag.len() < self.remaining.len() {
            // More input follows `subtag`; by construction the byte right after it is a
            // separator, so skip `subtag` plus that one byte.
            // SAFETY: `self.remaining` is strictly longer than `subtag` (`subtag` is a prefix
            // of it by the safety invariant), so `subtag.len() + 1 ≤ self.remaining.len()`
            let (_, rest) = unsafe { self.remaining.split_at_unchecked(subtag.len() + 1) };
            self.remaining = rest;
            // Safety invariant upheld: skip_before_separator() returns a prefix of `rest`,
            // and self.remaining is not mutated after this
            self.current = Some(skip_before_separator(rest));
        } else {
            // `subtag` was the last one.
            self.current = None;
        }
        (self, Some(subtag))
    }

    /// Returns the current subtag without consuming it.
    pub const fn peek(&self) -> Option<&'a [u8]> {
        self.current
    }
}
impl<'a> Iterator for SubtagIterator<'a> {
type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> {
let (s, res) = self.next_const();
*self = s;
res
}
}
#[cfg(test)]
mod test {
use super::*;
// Test helper: views a subtag's bytes as `&str` so assertions can use string literals.
fn slice_to_str(input: &[u8]) -> &str {
std::str::from_utf8(input).unwrap()
}
// `peek` must be repeatable and must not advance the iterator; `next` advances by one subtag.
#[test]
fn subtag_iterator_peek_test() {
let slice = "de-at-u-ca-foobar";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.peek().map(slice_to_str), Some("de"));
assert_eq!(si.peek().map(slice_to_str), Some("de"));
assert_eq!(si.next().map(slice_to_str), Some("de"));
assert_eq!(si.peek().map(slice_to_str), Some("at"));
assert_eq!(si.peek().map(slice_to_str), Some("at"));
assert_eq!(si.next().map(slice_to_str), Some("at"));
}
// Leading, trailing and doubled separators produce empty subtags rather than errors.
#[test]
fn subtag_iterator_test() {
// Empty input yields one empty subtag.
let slice = "";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
let slice = "-";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
// Leading separator: an empty subtag comes first.
let slice = "-en";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some("en"));
assert_eq!(si.next(), None);
let slice = "en";
let si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]);
// Trailing separator: an empty subtag comes last.
let slice = "en-";
let si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]);
// Two separators delimit three empty subtags.
let slice = "--";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next(), None);
let slice = "-en-";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some("en"));
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next(), None);
// Well-formed input is split at every separator.
let slice = "de-at-u-ca-foobar";
let si = SubtagIterator::new(slice.as_bytes());
assert_eq!(
si.map(slice_to_str).collect::<Vec<_>>(),
vec!["de", "at", "u", "ca", "foobar",]
);
}
// `skip_before_separator` returns everything before the first `-` (possibly nothing).
#[test]
fn skip_before_separator_test() {
let current = skip_before_separator(b"");
assert_eq!(current, b"");
let current = skip_before_separator(b"en");
assert_eq!(current, b"en");
let current = skip_before_separator(b"en-");
assert_eq!(current, b"en");
let current = skip_before_separator(b"en--US");
assert_eq!(current, b"en");
let current = skip_before_separator(b"-US");
assert_eq!(current, b"");
let current = skip_before_separator(b"US");
assert_eq!(current, b"US");
let current = skip_before_separator(b"-");
assert_eq!(current, b"");
}
}

View File

@@ -0,0 +1,23 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A set of extensions which correspond to preferences.
//!
//! The module provides structures that represent known values for each keyword
//! in [`Locale`] [`extensions`](crate::extensions) with semantic meaning.
//!
//! # Syntactic vs Semantic Extension Handling
//!
//! This module ensures that only valid, recognized values are used, providing semantic validation.
//! It would reject invalid values such as `-u-hc-BB` because `BB` is not a known hour cycle. This
//! is ideal for applications that require strict adherence to standardized values and need to
//! prevent invalid or unrecognized data.
//!
//! If you need to construct syntactically valid Locale extensions without semantic validation,
//! allowing any valid key-value pair regardless of recognition, consider using the
//! [`crate::extensions`] module.
//!
//! [`Locale`]: crate::Locale
pub mod unicode;

View File

@@ -0,0 +1,15 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Errors related to parsing of Preferences.
/// Error returned by parsers of unicode extensions as preferences.
// NOTE: `displaydoc::Display` derives the `Display` text from each variant's doc comment,
// so the `///` line on a variant is user-visible output, not just documentation.
#[non_exhaustive]
#[derive(Debug, displaydoc::Display)]
pub enum PreferencesParseError {
/// The given keyword value is not a valid preference variant.
InvalidKeywordValue,
}
// Marker impl: no `source()` override, the error carries no underlying cause.
impl core::error::Error for PreferencesParseError {}

View File

@@ -0,0 +1,66 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#![allow(non_snake_case)]
use crate::preferences::extensions::unicode::enum_keyword;
// Key-less form of `enum_keyword!`: only the enum itself is generated here. The string
// subtags for these variants are supplied by the nested `"islamic"` entry of
// `CalendarAlgorithm` below.
enum_keyword!(
/// Hijri Calendar sub-type
///
/// The list is based on [`CLDR Calendars`](https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml)
HijriCalendarAlgorithm {
/// Hijri calendar, Umm al-Qura
Umalqura,
/// Hijri calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - astronomical epoch)
Tbla,
/// Hijri calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - civil epoch)
Civil,
/// Hijri calendar, Saudi Arabia sighting
Rgsa
});
// The trailing `s, if ... { return ... }` arguments are the macro's alias hook: the
// statement runs before the regular subtag match with `s` bound to the input value, so
// the single-subtag spelling `islamicc` is accepted and parsed as `Hijri(Some(Civil))`.
enum_keyword!(
/// A Unicode Calendar Identifier defines a type of calendar.
///
/// This selects calendar-specific data within a locale used for formatting and parsing,
/// such as date/time symbols and patterns; it also selects supplemental calendarData used
/// for calendrical calculations. The value can affect the computation of the first day of the week.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCalendarIdentifier).
///
/// Corresponds to the `ca` Unicode extension key, e.g. `-u-ca-buddhist` or
/// `-u-ca-islamic-umalqura`.
CalendarAlgorithm {
/// Thai Buddhist calendar (same as Gregorian except for the year)
("buddhist" => Buddhist),
/// Traditional Chinese calendar
("chinese" => Chinese),
/// Coptic calendar
("coptic" => Coptic),
/// Traditional Korean calendar
("dangi" => Dangi),
/// Ethiopic calendar, Amete Alem (epoch approx. 5493 B.C.E)
("ethioaa" => Ethioaa),
/// Ethiopic calendar, Amete Mihret (epoch approx. 8 C.E.)
("ethiopic" => Ethiopic),
/// Gregorian calendar
("gregory" => Gregory),
/// Traditional Hebrew calendar
("hebrew" => Hebrew),
/// Indian calendar
("indian" => Indian),
/// Hijri calendar
("islamic" => Hijri(HijriCalendarAlgorithm) {
("umalqura" => Umalqura),
("tbla" => Tbla),
("civil" => Civil),
("rgsa" => Rgsa)
}),
/// ISO calendar (Gregorian calendar using the ISO 8601 calendar week rules)
("iso8601" => Iso8601),
/// Japanese Imperial calendar
("japanese" => Japanese),
/// Persian calendar
("persian" => Persian),
/// Republic of China calendar
("roc" => Roc)
}, "ca", s, if *s == value!("islamicc") { return Ok(Self::Hijri(Some(HijriCalendarAlgorithm::Civil))); });

View File

@@ -0,0 +1,75 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Collation Identifier defines a type of collation (sort order).
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCollationIdentifier).
///
/// Corresponds to the `co` Unicode extension key, e.g. `-u-co-phonebk`.
CollationType {
/// A previous version of the ordering, for compatibility
("compat" => Compat),
/// Dictionary style ordering (such as in Sinhala)
("dict" => Dict),
/// The default Unicode collation element table order
("ducet" => Ducet),
/// Recommended ordering for emoji characters
("emoji" => Emoji),
/// European ordering rules
("eor" => Eor),
/// Phonebook style ordering (such as in German)
("phonebk" => Phonebk),
/// Phonetic ordering (sorting based on pronunciation)
("phonetic" => Phonetic),
/// Pinyin ordering for Latin and for CJK characters (used in Chinese)
("pinyin" => Pinyin),
/// Special collation type for string search
("search" => Search),
/// Special collation type for Korean initial consonant search
("searchjl" => Searchjl),
/// Default ordering for each language
("standard" => Standard),
/// Pinyin ordering for Latin, stroke order for CJK characters (used in Chinese)
("stroke" => Stroke),
/// Traditional style ordering (such as in Spanish)
("trad" => Trad),
/// Pinyin ordering for Latin, Unihan radical-stroke ordering for CJK characters (used in Chinese)
("unihan" => Unihan),
/// Pinyin ordering for Latin, zhuyin order for Bopomofo and CJK characters (used in Chinese)
("zhuyin" => Zhuyin),
}, "co");
enum_keyword!(
/// Collation parameter key for ordering by case.
///
/// If set to upper, causes upper case to sort before lower case. If set to lower, causes lower case to sort before upper case.
/// Useful for locales that have already supported ordering but require different order of cases. Affects case and tertiary levels.
///
/// For details, see [LDML](https://unicode.org/reports/tr35/tr35-collation.html#Case_Parameters).
///
/// Corresponds to the `kf` Unicode extension key, e.g. `-u-kf-upper`.
[Default]
CollationCaseFirst {
/// Upper case to be sorted before lower case
("upper" => Upper),
/// Lower case to be sorted before upper case
("lower" => Lower),
/// No special case ordering
[default]
("false" => False),
}, "kf");
enum_keyword!(
/// Collation parameter key for numeric handling.
///
/// If set to on, any sequence of Decimal Digits (General_Category = Nd in the UAX44) is sorted at a primary level with
/// its numeric value. For example, "1" < "2" < "10". The computed primary weights are all at the start of the digit
/// reordering group.
///
/// Corresponds to the `kn` Unicode extension key. A value without any subtag is treated
/// as `true` when parsing, so a bare `-u-kn` yields [`CollationNumericOrdering::True`].
[Default]
CollationNumericOrdering {
/// A sequence of decimal digits is sorted at primary level with its numeric value
("true" => True),
/// No special handling for numeric ordering
[default]
("false" => False),
}, "kn");

View File

@@ -0,0 +1,31 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{extensions::unicode::Value, subtags::Subtag};
use tinystr::TinyAsciiStr;
struct_keyword!(
/// A Unicode Currency Identifier defines a type of currency.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCurrencyIdentifier).
///
/// Parsing accepts only a single subtag made of exactly three ASCII letters; the code is
/// not checked against a list of known currencies.
CurrencyType,
"cu",
TinyAsciiStr<3>,
// Value -> CurrencyType: reject multi-subtag values and anything that is not 3 letters.
|input: Value| {
if let Some(subtag) = input.into_single_subtag() {
let ts = subtag.as_tinystr();
if ts.len() == 3 && ts.is_ascii_alphabetic() {
// The length was just checked, so narrowing the buffer to 3 bytes loses nothing.
return Ok(Self(ts.resize()));
}
}
Err(PreferencesParseError::InvalidKeywordValue)
},
// CurrencyType -> Value: widen the 3-byte string back into a subtag. Skipping validation
// presumably relies on the three-letter check done at parse time -- TODO confirm for
// values constructed directly through the public field/constructor, if any.
|input: CurrencyType| {
crate::extensions::unicode::Value::from_subtag(Some(
Subtag::from_tinystr_unvalidated(input.0.resize()),
))
}
);

View File

@@ -0,0 +1,18 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Currency Format Identifier defines a style for currency formatting.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCurrencyFormatIdentifier).
///
/// Corresponds to the `cf` Unicode extension key, e.g. `-u-cf-account`.
[Default]
CurrencyFormatStyle {
/// Negative numbers use the minusSign symbol (the default)
[default]
("standard" => Standard),
/// Negative numbers use parentheses or equivalent
("account" => Account)
}, "cf");

View File

@@ -0,0 +1,34 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions::unicode::Value;
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::subtags::Script;
use alloc::vec::Vec;
use core::str::FromStr;
struct_keyword!(
/// A Unicode Dictionary Break Exclusion Identifier specifies
/// scripts to be excluded from dictionary-based text break (for words and lines).
///
/// The valid values are of one or more items of type [`Script`](crate::subtags::Script).
///
/// Parsing fails as a whole if any subtag of the value is not a valid [`Script`](crate::subtags::Script).
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
DictionaryBreakScriptExclusions,
"dx",
Vec<Script>,
// Value -> exclusions: parse every subtag as a script; collecting into `Result`
// stops at the first subtag that fails to parse.
|input: Value| {
input
.into_iter()
.map(|s| {
Script::from_str(s.as_str()).map_err(|_| PreferencesParseError::InvalidKeywordValue)
})
.collect::<Result<_, _>>()
.map(Self)
},
// Exclusions -> Value: convert each script back into a subtag, preserving order.
|input: DictionaryBreakScriptExclusions| {
input.0.into_iter().map(Into::into).collect()
}
);

View File

@@ -0,0 +1,23 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Emoji Presentation Style Identifier
///
/// It specifies a request for the preferred emoji
/// presentation style. This can be used as part of the value for an HTML lang attribute,
/// for example `<html lang="sr-Latn-u-em-emoji">`.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeEmojiPresentationStyleIdentifier).
[Default]
EmojiPresentationStyle {
/// Use an emoji presentation for emoji characters if possible
("emoji" => Emoji),
/// Use a text presentation for emoji characters if possible
("text" => Text),
/// Use the default presentation for emoji characters as specified in UTR #51 Presentation Style
[default]
("default" => Default)
}, "em");

View File

@@ -0,0 +1,29 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode First Day Identifier defines the preferred first day of the week for calendar display.
///
/// Specifying "fw" in a locale identifier overrides the default value specified by
/// supplemental week data for the region.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeFirstDayIdentifier).
///
/// Example: `-u-fw-mon` requests Monday as the first day of the week.
FirstDay {
/// Sunday
("sun" => Sun),
/// Monday
("mon" => Mon),
/// Tuesday
("tue" => Tue),
/// Wednesday
("wed" => Wed),
/// Thursday
("thu" => Thu),
/// Friday
("fri" => Fri),
/// Saturday
("sat" => Sat)
}, "fw");

View File

@@ -0,0 +1,18 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Hour Cycle Identifier defines the preferred time cycle. Specifying "hc" in a locale identifier overrides the default value specified by supplemental time data for the region.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeHourCycleIdentifier).
HourCycle {
/// The typical 12-hour clock. Hours are numbered 1–12. Corresponds to 'h' in patterns.
("h12" => H12),
/// The 24-hour clock. Hours are numbered 0–23. Corresponds to 'H' in patterns.
("h23" => H23),
/// Variant of the 12-hour clock, sometimes used in Japan. Hours are numbered 0–11. Corresponds to 'K' in patterns.
("h11" => H11),
}, "hc");

View File

@@ -0,0 +1,21 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Line Break Style Identifier defines a preferred line break style corresponding to the CSS level 3 line-break option.
///
/// Specifying "lb" in a locale identifier overrides the locale's default style
/// (which may correspond to "normal" or "strict").
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeLineBreakStyleIdentifier).
LineBreakStyle {
/// CSS level 3 line-break=strict, e.g. treat CJ as NS
("strict" => Strict),
/// CSS level 3 line-break=normal, e.g. treat CJ as ID, break before hyphens for ja,zh
("normal" => Normal),
/// CSS level 3 line-break=loose
("loose" => Loose),
}, "lb");

View File

@@ -0,0 +1,23 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Line Break Word Identifier defines preferred line break word handling behavior corresponding to the CSS level 3 word-break option.
///
/// Specifying "lw" in a locale identifier overrides the locale's default style (which may correspond to "normal" or "keepall").
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeLineBreakWordIdentifier).
LineBreakWordHandling {
/// CSS level 3 word-break=normal, normal script/language behavior for midword breaks
("normal" => Normal),
/// CSS level 3 word-break=break-all, allow midword breaks unless forbidden by lb setting
("breakall" => BreakAll),
/// CSS level 3 word-break=keep-all, prohibit midword breaks except for dictionary breaks
("keepall" => KeepAll),
/// Prioritize keeping natural phrases (of multiple words) together when breaking,
/// used in short text like title and headline
("phrase" => Phrase),
}, "lw");

View File

@@ -0,0 +1,20 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Measurement System Identifier defines a preferred measurement system.
///
/// Specifying "ms" in a locale identifier overrides the default value specified by supplemental measurement system data for the region.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeMeasurementSystemIdentifier).
MeasurementSystem {
/// Metric System
("metric" => Metric),
/// US System of measurement: feet, pints, etc.; pints are 16oz
("ussystem" => USSystem),
/// UK System of measurement: feet, pints, etc.; pints are 20oz
("uksystem" => UKSystem)
}, "ms");

View File

@@ -0,0 +1,18 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Measurement Unit Preference Override defines an override for measurement unit preference.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#MeasurementUnitPreferenceOverride).
///
/// Corresponds to the `mu` Unicode extension key, e.g. `-u-mu-celsius`.
MeasurementUnitOverride {
/// Celsius as temperature unit
("celsius" => Celsius),
/// Kelvin as temperature unit
("kelvin" => Kelvin),
/// Fahrenheit as temperature unit
// The key really is the truncated spelling `fahrenhe` (presumably because subtags are
// limited to 8 characters); only the Rust variant name is spelled out in full.
("fahrenhe" => Fahrenheit),
}, "mu");

View File

@@ -0,0 +1,46 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A list of Preferences derived from Locale unicode extension keywords.
#![allow(unused_imports)]
mod calendar;
pub use calendar::*;
mod collation;
pub use collation::*;
mod currency;
pub use currency::*;
mod currency_format;
pub use currency_format::*;
#[cfg(feature = "alloc")]
mod dictionary_break;
#[cfg(feature = "alloc")]
pub use dictionary_break::*;
mod emoji;
pub use emoji::*;
mod first_day;
pub use first_day::*;
mod hour_cycle;
pub use hour_cycle::*;
mod line_break;
pub use line_break::*;
mod line_break_word;
pub use line_break_word::*;
mod measurement_system;
pub use measurement_system::*;
mod measurement_unit_override;
pub use measurement_unit_override::*;
mod numbering_system;
pub use numbering_system::*;
mod region_override;
pub use region_override::*;
mod regional_subdivision;
pub use regional_subdivision::*;
mod sentence_supression;
pub use sentence_supression::*;
mod timezone;
pub use timezone::*;
mod variant;
pub use variant::*;

View File

@@ -0,0 +1,26 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{extensions::unicode::Value, subtags::Subtag};
struct_keyword!(
/// A Unicode Number System Identifier defines a type of number system.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeNumberSystemIdentifier).
///
/// Parsing accepts any value consisting of exactly one subtag; the subtag is stored as-is
/// and is not checked against a list of known numbering systems.
[Copy]
NumberingSystem,
"nu",
Subtag,
// Value -> NumberingSystem: anything other than exactly one subtag is an error.
|input: Value| {
input
.into_single_subtag()
.map(Self)
.ok_or(PreferencesParseError::InvalidKeywordValue)
},
// NumberingSystem -> Value: wrap the stored subtag back into a single-subtag value.
|input: NumberingSystem| {
crate::extensions::unicode::Value::from_subtag(Some(input.0))
}
);

View File

@@ -0,0 +1,63 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions::unicode::{SubdivisionId, Value};
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
struct_keyword!(
/// A Region Override specifies an alternate region to use for obtaining certain region-specific default values.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#RegionOverride).
///
/// The value is a single subtag holding a region code immediately followed by a
/// subdivision suffix, e.g. `uszzzz` in `-u-rg-uszzzz`.
[Copy]
RegionOverride,
"rg",
SubdivisionId,
// Value -> RegionOverride: require exactly one subtag and delegate the region/suffix
// split to `SubdivisionId`'s `FromStr`; any failure maps to `InvalidKeywordValue`.
|input: Value| {
input
.into_single_subtag()
.and_then(|subtag| subtag.as_str().parse().ok().map(Self))
.ok_or(PreferencesParseError::InvalidKeywordValue)
},
// RegionOverride -> Value: `SubdivisionId::into_subtag` rebuilds the combined subtag.
|input: RegionOverride| {
Value::from_subtag(Some(input.0.into_subtag()))
}
);
// Parsing tests for `RegionOverride`.
// NOTE(review): only the `Value` -> `RegionOverride` direction is asserted here; the
// reverse conversion is not round-tripped.
#[cfg(test)]
mod test {
use super::*;
use crate::extensions::unicode;
use crate::extensions::unicode::subdivision_suffix;
use crate::subtags::region;
#[test]
fn region_override_test() {
// Two-letter region + three-letter suffix.
let val = unicode::value!("uksct");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("UK"));
assert_eq!(rg.0.suffix, subdivision_suffix!("sct"));
// Two-letter region + two-letter suffix.
let val = unicode::value!("usca");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("US"));
assert_eq!(rg.0.suffix, subdivision_suffix!("ca"));
// Three-digit region + three-letter suffix.
let val = unicode::value!("419bel");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("419"));
assert_eq!(rg.0.suffix, subdivision_suffix!("bel"));
// Two-letter region + four-letter suffix.
let val = unicode::value!("uszzzz");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("us"));
assert_eq!(rg.0.suffix, subdivision_suffix!("zzzz"));
// Syntactically valid extension values that are not valid subdivision ids.
for i in &["4aabel", "a4bel", "ukabcde"] {
let val = unicode::Value::try_from_str(i).unwrap();
let rg: Result<RegionOverride, _> = val.try_into();
assert!(rg.is_err());
}
}
}

View File

@@ -0,0 +1,65 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{
extensions::unicode::{SubdivisionId, Value},
subtags::Subtag,
};
struct_keyword!(
    /// A Unicode Subdivision Identifier defines a regional subdivision used for locales.
    ///
    /// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeSubdivisionIdentifier).
    ///
    /// The value is a single subtag holding a region code immediately followed by a
    /// subdivision suffix, e.g. `gbsct` in `-u-sd-gbsct`.
    [Copy]
    RegionalSubdivision,
    "sd",
    SubdivisionId,
    // Value -> RegionalSubdivision: require exactly one subtag and delegate the
    // region/suffix split to `SubdivisionId`'s `FromStr`; any failure maps to
    // `InvalidKeywordValue`.
    |input: Value| {
        input
            .into_single_subtag()
            .and_then(|subtag| subtag.as_str().parse().ok().map(Self))
            .ok_or(PreferencesParseError::InvalidKeywordValue)
    },
    // RegionalSubdivision -> Value: concatenate the region bytes and the suffix bytes
    // into one raw subtag buffer.
    |input: RegionalSubdivision| {
        let region = input.0.region.into_raw();
        let suffix = input.0.suffix.into_raw();

        let mut raw = [0; 8];
        // Copy all three region slots; for a two-character region the third slot is
        // overwritten by the first suffix byte below.
        raw[0] = region[0];
        raw[1] = region[1];
        raw[2] = region[2];
        let len = input.0.region.as_str().len();
        debug_assert!((2..=3).contains(&len));
        #[allow(clippy::indexing_slicing)] // safe
        {
            // `len <= 3`, so the highest index written is 6, inside the 8-byte buffer.
            raw[len] = suffix[0];
            raw[len + 1] = suffix[1];
            raw[len + 2] = suffix[2];
            // Each suffix slot must be copied from its own index: reading index 2 again
            // here would duplicate the third character and drop the fourth.
            raw[len + 3] = suffix[3];
        }
        #[expect(clippy::unwrap_used)] // correct by construction
        Value::from_subtag(Some(Subtag::try_from_raw(raw).unwrap()))
    }
);
// Parsing tests for `RegionalSubdivision`.
// NOTE(review): only the `Value` -> `RegionalSubdivision` direction is asserted here; the
// hand-written serializer in the `struct_keyword!` invocation has no round-trip coverage.
#[cfg(test)]
mod test {
use super::*;
use crate::extensions::unicode;
use crate::extensions::unicode::subdivision_suffix;
use crate::subtags::region;
#[test]
fn region_subdivision_test() {
let val = unicode::value!("uksct");
let rg: RegionalSubdivision = val.try_into().unwrap();
assert_eq!(rg.region, region!("UK"));
assert_eq!(rg.suffix, subdivision_suffix!("sct"));
// Syntactically valid extension values that are not valid subdivision ids.
for i in &["4aabel", "a4bel", "ukabcde"] {
let val = unicode::Value::try_from_str(i).unwrap();
let rg: Result<RegionalSubdivision, _> = val.try_into();
assert!(rg.is_err());
}
}
}

View File

@@ -0,0 +1,19 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Sentence Break Suppressions Identifier defines a set of data to be used for suppressing certain
/// sentence breaks that would otherwise be found by UAX #14 rules.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeSentenceBreakSuppressionsIdentifier).
[Default]
SentenceBreakSupressions {
/// Don't use sentence break suppressions data (the default)
[default]
("none" => None),
/// Use sentence break suppressions data of type "standard"
("standard" => Standard),
}, "ss");

View File

@@ -0,0 +1,26 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{extensions::unicode::Value, subtags::Subtag};
struct_keyword!(
/// A Unicode Timezone Identifier defines a timezone.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeTimezoneIdentifier).
///
/// Parsing accepts any value consisting of exactly one subtag; the subtag is stored as-is
/// and is not checked against a list of known time zone identifiers.
[Copy]
TimeZoneShortId,
"tz",
Subtag,
// Value -> TimeZoneShortId: anything other than exactly one subtag is an error.
|input: Value| {
input
.into_single_subtag()
.map(Self)
.ok_or(PreferencesParseError::InvalidKeywordValue)
},
// TimeZoneShortId -> Value: wrap the stored subtag back into a single-subtag value.
|input: TimeZoneShortId| {
crate::extensions::unicode::Value::from_subtag(Some(input.0))
}
);

View File

@@ -0,0 +1,14 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Variant Identifier defines a special variant used for locales.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeVariantIdentifier).
///
/// Corresponds to the `va` Unicode extension key, e.g. `-u-va-posix`.
CommonVariantType {
/// POSIX style locale variant
("posix" => Posix),
}, "va");

View File

@@ -0,0 +1,322 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Internal macro used by `enum_keyword` for nesting.
///
/// Expands to the expression that constructs one variant while parsing:
/// a plain `Name::Variant` for unit variants, or `Name::Variant(Option<Sub>)` for nested
/// variants, where the payload is looked up from the second subtag of the value.
#[macro_export]
#[doc(hidden)]
macro_rules! __enum_keyword_inner {
// Unit variant: nothing beyond the first subtag is inspected.
($name:ident, $variant:ident) => {
$name::$variant
};
// Nested variant: map the subtag at index 1 onto the sub-enum. A missing or
// unrecognized second subtag yields `None` rather than an error.
($name:ident, $variant:ident, $s:ident, $v2:ident, $($subk:expr => $subv:ident),*) => {{
let sv = $s.get_subtag(1).and_then(|st| {
match st.as_str() {
$(
$subk => Some($v2::$subv),
)*
_ => None,
}
});
$name::$variant(sv)
}};
}
/// Macro used to generate a preference keyword as an enum.
///
/// The macro supports single and two subtag enums.
///
/// It can be invoked in three shapes:
///
/// * `Name { A, B }` generates only the enum (used for sub-enums whose string keys are
///   supplied by the parent keyword).
/// * `Name { ("a" => A), ... }, "xx"` generates the enum plus `PreferenceKey`,
///   `TryFrom<&Value>`, `From<Name> for Value` and `as_str` for extension key `xx`.
/// * The same followed by `, ident, statement` additionally runs `statement` at the
///   start of parsing with `ident` bound to the input value; this is how aliases are
///   handled.
///
/// When parsing, a value without any subtag is treated as `"true"`, and an unrecognized
/// second subtag of a nested variant is dropped (the payload becomes `None`).
///
/// # Examples
///
/// ```
/// use icu::locale::preferences::extensions::unicode::enum_keyword;
///
/// enum_keyword!(
///     EmojiPresentationStyle {
///         ("emoji" => Emoji),
///         ("text" => Text),
///         ("default" => Default)
/// }, "em");
///
/// enum_keyword!(
///     MetaKeyword {
///         ("normal" => Normal),
///         ("emoji" => Emoji(EmojiPresentationStyle) {
///             ("emoji" => Emoji),
///             ("text" => Text),
///             ("default" => Default)
///         })
/// }, "mk");
/// ```
#[macro_export]
#[doc(hidden)]
macro_rules! __enum_keyword {
// Arm 1: enum definition only. Nested variants carry `Option<$v2>` as payload.
(
$(#[$doc:meta])*
$([$derive_attrs:ty])?
$name:ident {
$(
$(#[$variant_doc:meta])*
$([$variant_attr:ty])?
$variant:ident $($v2:ident)?
),*
}
) => {
#[non_exhaustive]
#[derive(Debug, Clone, Eq, PartialEq, Copy, Hash)]
$(#[derive($derive_attrs)])?
$(#[$doc])*
pub enum $name {
$(
$(#[$variant_doc])*
$(#[$variant_attr])?
$variant $((Option<$v2>))?
),*
}
};
// Arm 2: keyed form. Delegates the enum definition to arm 1, then emits the trait
// impls that tie the enum to the Unicode extension key `$ext_key`.
($(#[$doc:meta])*
$([$derive_attrs:ty])?
$name:ident {
$(
$(#[$variant_doc:meta])*
$([$variant_attr:ty])?
($key:expr => $variant:ident $(($v2:ident) {
$(
($subk:expr => $subv:ident)
),*
})?)
),* $(,)?
},
$ext_key:literal
$(, $input:ident, $aliases:stmt)?
) => {
$crate::__enum_keyword!(
$(#[$doc])*
$([$derive_attrs])?
$name {
$(
$(#[$variant_doc])*
$([$variant_attr])?
$variant $($v2)?
),*
}
);
impl $crate::preferences::PreferenceKey for $name {
fn unicode_extension_key() -> Option<$crate::extensions::unicode::Key> {
Some($crate::extensions::unicode::key!($ext_key))
}
// Returns `Ok(None)` when `key` belongs to a different keyword, so callers can probe
// every keyword type with the same key/value pair.
fn try_from_key_value(
key: &$crate::extensions::unicode::Key,
value: &$crate::extensions::unicode::Value,
) -> Result<Option<Self>, $crate::preferences::extensions::unicode::errors::PreferencesParseError> {
if Self::unicode_extension_key() == Some(*key) {
Self::try_from(value).map(Some)
} else {
Ok(None)
}
}
fn unicode_extension_value(&self) -> Option<$crate::extensions::unicode::Value> {
Some((*self).into())
}
}
impl TryFrom<&$crate::extensions::unicode::Value> for $name {
type Error = $crate::preferences::extensions::unicode::errors::PreferencesParseError;
fn try_from(s: &$crate::extensions::unicode::Value) -> Result<Self, Self::Error> {
let subtag = s.get_subtag(0)
// No subtag is equivalent to the "true" value.
.unwrap_or(&$crate::subtags::subtag!("true"));
// Brought into scope for the caller-supplied alias statement, which may use `value!`.
#[allow(unused_imports)]
use $crate::extensions::unicode::value;
// Optional alias hook: runs before the main match and may `return` early.
$(
let $input = s;
$aliases
)?
Ok(match subtag.as_str() {
$(
$key => {
$crate::__enum_keyword_inner!($name, $variant$(, s, $v2, $($subk => $subv),*)?)
}
)*
_ => {
return Err(Self::Error::InvalidKeywordValue);
}
})
}
}
impl From<$name> for $crate::extensions::unicode::Value {
fn from(input: $name) -> $crate::extensions::unicode::Value {
// `f` is the first subtag (always set by the match below); `s` is the optional
// second subtag, set only for nested variants that carry a payload.
let f;
#[allow(unused_mut)]
let mut s = None;
match input {
$(
// This is circumventing a limitation of the macro_rules - we need to have a conditional
// $()? case here for when the variant has a value, and macro_rules require us to
// reference the $v2 inside it, but in match case it becomes a variable, so clippy
// complains.
#[allow(non_snake_case)]
$name::$variant $(($v2))? => {
f = $crate::subtags::subtag!($key);
$(
if let Some(v2) = $v2 {
match v2 {
$(
$v2::$subv => s = Some($crate::subtags::subtag!($subk)),
)*
}
}
)?
},
)*
}
if let Some(s) = s {
$crate::extensions::unicode::Value::from_two_subtags(f, s)
} else {
$crate::extensions::unicode::Value::from_subtag(Some(f))
}
}
}
impl $name {
/// A helper function for displaying as a `&str`.
pub const fn as_str(&self) -> &'static str {
match self {
$(
// This is circumventing a limitation of the macro_rules - we need to have a conditional
// $()? case here for when the variant has a value, and macro_rules require us to
// reference the $v2 inside it, but in match case it becomes a variable, so clippy
// complains.
#[allow(non_snake_case)]
Self::$variant $(($v2))? => {
$(
if let Some(v2) = $v2 {
return match v2 {
$(
$v2::$subv => concat!($key, '-', $subk),
)*
};
}
)?
return $key;
},
)*
}
}
}
};
}
pub use __enum_keyword as enum_keyword;
// Tests for the three invocation shapes of `enum_keyword!`.
#[cfg(test)]
mod tests {
use super::*;
use crate::extensions::unicode;
use core::str::FromStr;
// Flat keyword: parse, round-trip back to `Value`, reject unknown values.
#[test]
fn enum_keywords_test() {
enum_keyword!(DummyKeyword {
("standard" => Standard),
("rare" => Rare),
}, "dk");
let v = unicode::Value::from_str("standard").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Standard);
assert_eq!(unicode::Value::from(dk), v);
let v = unicode::Value::from_str("rare").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Rare);
assert_eq!(unicode::Value::from(dk), v);
let v = unicode::Value::from_str("foo").unwrap();
let dk = DummyKeyword::try_from(&v);
assert!(dk.is_err());
assert_eq!(DummyKeyword::Standard.as_str(), "standard");
}
// Alias hook: `std` parses as `Standard`, but serializes back as the canonical `standard`.
#[test]
fn enum_keywords_test_alias() {
enum_keyword!(DummyKeyword {
("standard" => Standard),
("rare" => Rare),
}, "dk", s, if *s == value!("std") { return Ok(Self::Standard) });
let v = unicode::Value::from_str("standard").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Standard);
assert_eq!(unicode::Value::from(dk), v);
let v_alias = unicode::Value::from_str("std").unwrap();
let dk = DummyKeyword::try_from(&v_alias).unwrap();
assert_eq!(dk, DummyKeyword::Standard);
// Compared against `v` ("standard"), not `v_alias`: the alias is not preserved.
assert_eq!(unicode::Value::from(dk), v);
let v = unicode::Value::from_str("rare").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Rare);
assert_eq!(unicode::Value::from(dk), v);
let v = unicode::Value::from_str("foo").unwrap();
let dk = DummyKeyword::try_from(&v);
assert!(dk.is_err());
assert_eq!(DummyKeyword::Standard.as_str(), "standard");
}
// Nested keyword: the second subtag selects the sub-enum payload.
#[test]
fn enum_keywords_nested_test() {
// Key-less form: defines the sub-enum only.
enum_keyword!(DummySubKeyword { Standard, Rare });
enum_keyword!(DummyKeyword {
("default" => Default),
("sub" => Sub(DummySubKeyword) {
("standard" => Standard),
("rare" => Rare)
})
}, "dk");
let v = unicode::Value::from_str("default").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Default);
assert_eq!(unicode::Value::from(dk), v);
// Nested variant without a second subtag: payload is `None`.
let v = unicode::Value::from_str("sub").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Sub(None));
assert_eq!(unicode::Value::from(dk), v);
let v = unicode::Value::from_str("foo").unwrap();
let dk = DummyKeyword::try_from(&v);
assert!(dk.is_err());
let v = unicode::Value::from_str("sub-standard").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Sub(Some(DummySubKeyword::Standard)));
assert_eq!(unicode::Value::from(dk), v);
let v = unicode::Value::from_str("sub-rare").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Sub(Some(DummySubKeyword::Rare)));
assert_eq!(unicode::Value::from(dk), v);
// Unknown second subtag is not an error: it is dropped, and so is lost on round-trip.
let v = unicode::Value::from_str("sub-foo").unwrap();
let dk = DummyKeyword::try_from(&v).unwrap();
assert_eq!(dk, DummyKeyword::Sub(None));
assert_eq!(unicode::Value::from(dk), unicode::value!("sub"));
assert_eq!(
DummyKeyword::Sub(Some(DummySubKeyword::Rare)).as_str(),
"sub-rare"
);
}
}

View File

@@ -0,0 +1,11 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod enum_keyword;
mod struct_keyword;
#[doc(inline)]
pub use enum_keyword::enum_keyword;
#[doc(inline)]
pub use struct_keyword::struct_keyword;

View File

@@ -0,0 +1,124 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Macro used to generate a preference keyword as a struct.
///
/// The arguments are, in order: optional attributes (such as doc comments), an
/// optional bracketed extra derive (for example `[Copy]`), the struct name, the
/// unicode extension key literal, the wrapped value type, an expression used to
/// convert a `Value` into the struct, and an expression used to convert the
/// struct back into a `Value`.
///
/// The generated item is a public tuple struct wrapping the value type, together
/// with `TryFrom<Value>`, `From<Name> for Value`, `PreferenceKey` and `Deref`
/// implementations.
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{Key, Value},
/// preferences::extensions::unicode::struct_keyword,
/// };
///
/// struct_keyword!(
/// CurrencyType,
/// "cu",
/// String,
/// |input: Value| { Ok(Self(input.to_string())) },
/// |input: CurrencyType| {
/// icu::locale::extensions::unicode::Value::try_from_str(
/// input.0.as_str(),
/// )
/// .unwrap()
/// }
/// );
/// ```
#[macro_export]
#[doc(hidden)]
macro_rules! __struct_keyword {
($(#[$doc:meta])* $([$derive_attrs:ty])? $name:ident, $ext_key:literal, $value:ty, $try_from:expr, $into:expr) => {
$(#[$doc])*
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
$(#[derive($derive_attrs)])?
#[allow(clippy::exhaustive_structs)] // TODO
pub struct $name($value);
// Parsing is delegated to the caller-supplied `$try_from` expression.
impl TryFrom<$crate::extensions::unicode::Value> for $name {
type Error = $crate::preferences::extensions::unicode::errors::PreferencesParseError;
fn try_from(
input: $crate::extensions::unicode::Value,
) -> Result<Self, Self::Error> {
$try_from(input)
}
}
// Serialization is delegated to the caller-supplied `$into` expression.
impl From<$name> for $crate::extensions::unicode::Value {
fn from(input: $name) -> $crate::extensions::unicode::Value {
$into(input)
}
}
impl $crate::preferences::PreferenceKey for $name {
fn unicode_extension_key() -> Option<$crate::extensions::unicode::Key> {
Some($crate::extensions::unicode::key!($ext_key))
}
fn try_from_key_value(
key: &$crate::extensions::unicode::Key,
value: &$crate::extensions::unicode::Value,
) -> Result<Option<Self>, $crate::preferences::extensions::unicode::errors::PreferencesParseError> {
// Only values stored under this keyword's own key are parsed;
// any other key yields `Ok(None)`.
if Self::unicode_extension_key() == Some(*key) {
// `TryFrom` takes the value by value, hence the clone.
let result = Self::try_from(value.clone())?;
Ok(Some(result))
} else {
Ok(None)
}
}
fn unicode_extension_value(
&self,
) -> Option<$crate::extensions::unicode::Value> {
Some(self.clone().into())
}
}
// Gives read-only access to the wrapped value.
impl core::ops::Deref for $name {
type Target = $value;
fn deref(&self) -> &Self::Target {
&self.0
}
}
};
}
pub use __struct_keyword as struct_keyword;
// Unit test for the `struct_keyword!` macro.
#[cfg(test)]
mod tests {
use super::*;
use crate::{
extensions::unicode,
subtags::{subtag, Subtag},
};
use core::str::FromStr;
// Declares a keyword wrapping a single `Subtag` whose parser only accepts
// single subtags of length 3, then checks one accepted and one rejected value.
#[test]
fn struct_keywords_test() {
struct_keyword!(
DummyKeyword,
"dk",
Subtag,
|input: unicode::Value| {
if let Some(subtag) = input.into_single_subtag() {
if subtag.len() == 3 {
return Ok(DummyKeyword(subtag));
}
}
Err(crate::preferences::extensions::unicode::errors::PreferencesParseError::InvalidKeywordValue)
},
|input: DummyKeyword| { unicode::Value::from_subtag(Some(input.0)) }
);
// "foo" has length 3: it is accepted and round-trips.
let v = unicode::Value::from_str("foo").unwrap();
let dk: DummyKeyword = v.clone().try_into().unwrap();
assert_eq!(dk, DummyKeyword(subtag!("foo")));
assert_eq!(unicode::Value::from(dk), v);
// "foobar" has length 6: the custom parser rejects it.
let v = unicode::Value::from_str("foobar").unwrap();
let dk: Result<DummyKeyword, _> = v.clone().try_into();
assert!(dk.is_err());
}
}

View File

@@ -0,0 +1,17 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A set of unicode extensions which correspond to preferences.
//!
//! The module contains a set of structs corresponding to [`Locale`] [`unicode`](crate::extensions::unicode)
//! extensions for which ICU4X provides implementations of preferences.
//!
//! The macros in this module provide wrappers for creating preferences based on enums and structs.
//!
//! [`Locale`]: crate::Locale
pub mod errors;
pub mod keywords;
mod macros;
#[doc(inline)]
pub use macros::*;

View File

@@ -0,0 +1,181 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#[cfg(feature = "alloc")]
use crate::subtags::Variants;
use crate::subtags::{Language, Region, Script, Subtag, Variant};
use crate::DataLocale;
/// The structure storing locale subtags used in preferences.
///
/// Besides the language identifier subtags, it stores two values that the
/// `From<&Locale>` conversion in this file reads from the unicode extension
/// keywords `sd` (subdivision) and `rg` (region override).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct LocalePreferences {
/// Preference of Language
pub(crate) language: Language,
/// Preference of Script
pub(crate) script: Option<Script>,
/// Preference of Region
pub(crate) region: Option<Region>,
/// Preference of Variant
///
/// Only a single variant is stored; the conversions in this file keep the
/// first variant of the source locale.
pub(crate) variant: Option<Variant>,
/// Preference of Regional Subdivision
pub(crate) subdivision: Option<Subtag>,
/// Preference of Unicode Extension Region
pub(crate) ue_region: Option<Region>,
}
impl LocalePreferences {
    /// Builds a [`DataLocale`] from these preferences.
    ///
    /// When `region_priority` is `true` and both the locale region and the
    /// unicode extension region are present, the extension region is used.
    /// In every other case the locale region (possibly `None`) is used.
    fn to_data_locale_maybe_region_priority(self, region_priority: bool) -> DataLocale {
        let region = match self.ue_region {
            Some(ue_region) if region_priority && self.region.is_some() => Some(ue_region),
            _ => self.region,
        };
        DataLocale {
            language: self.language,
            script: self.script,
            region,
            variant: self.variant,
            subdivision: self.subdivision,
        }
    }

    /// Convert to a DataLocale, with region-based fallback priority
    ///
    /// Most users should use `icu_provider::marker::make_locale()` instead.
    pub fn to_data_locale_region_priority(self) -> DataLocale {
        let region_priority = true;
        self.to_data_locale_maybe_region_priority(region_priority)
    }

    /// Convert to a DataLocale, with language-based fallback priority
    ///
    /// Most users should use `icu_provider::marker::make_locale()` instead.
    pub fn to_data_locale_language_priority(self) -> DataLocale {
        let region_priority = false;
        self.to_data_locale_maybe_region_priority(region_priority)
    }
}
impl Default for LocalePreferences {
fn default() -> Self {
// Not recursive: `LocalePreferences` also has an inherent
// `const fn default()`, and in a `Self::default()` path the inherent
// associated function takes precedence over this trait method.
Self::default()
}
}
impl From<&crate::Locale> for LocalePreferences {
    /// Extracts the locale preferences from a [`Locale`](crate::Locale).
    ///
    /// The subdivision is taken from the `sd` unicode keyword and the
    /// extension region from the `rg` keyword; both are only used when the
    /// keyword value is a single subtag (and, for `rg`, parses as a region).
    /// Only the first variant of the locale is kept.
    fn from(loc: &crate::Locale) -> Self {
        let keywords = &loc.extensions.unicode.keywords;

        let subdivision = keywords
            .get(&crate::extensions::unicode::key!("sd"))
            .and_then(|value| value.as_single_subtag())
            .copied();

        let ue_region = keywords
            .get(&crate::extensions::unicode::key!("rg"))
            .and_then(|value| value.as_single_subtag())
            .and_then(|subtag| Region::try_from_str(subtag.as_str()).ok());

        let first_variant = loc.id.variants.iter().next().copied();

        Self {
            language: loc.id.language,
            script: loc.id.script,
            region: loc.id.region,
            variant: first_variant,
            subdivision,
            ue_region,
        }
    }
}
impl From<&crate::LanguageIdentifier> for LocalePreferences {
    /// Extracts the locale preferences from a language identifier.
    ///
    /// A language identifier has no extensions, so the subdivision and the
    /// extension region are always `None`. Only the first variant is kept.
    fn from(lid: &crate::LanguageIdentifier) -> Self {
        let first_variant = lid.variants.iter().next().copied();
        Self {
            language: lid.language,
            script: lid.script,
            region: lid.region,
            variant: first_variant,
            subdivision: None,
            ue_region: None,
        }
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl From<LocalePreferences> for crate::Locale {
    /// Rebuilds a locale from the preferences: the subtags go into the
    /// language identifier, the subdivision into the `sd` unicode keyword and
    /// the extension region into the `rg` unicode keyword.
    fn from(prefs: LocalePreferences) -> Self {
        let mut extensions = crate::extensions::Extensions::default();

        if let Some(sd) = prefs.subdivision {
            let sd_value = crate::extensions::unicode::Value::from_subtag(Some(sd));
            extensions
                .unicode
                .keywords
                .set(crate::extensions::unicode::key!("sd"), sd_value);
        }

        if let Some(rg) = prefs.ue_region {
            #[expect(clippy::unwrap_used)] // Region is a valid Subtag
            let rg_value = crate::extensions::unicode::Value::try_from_str(rg.as_str()).unwrap();
            extensions
                .unicode
                .keywords
                .set(crate::extensions::unicode::key!("rg"), rg_value);
        }

        let variants = prefs
            .variant
            .map(Variants::from_variant)
            .unwrap_or_default();

        let id = crate::LanguageIdentifier {
            language: prefs.language,
            script: prefs.script,
            region: prefs.region,
            variants,
        };

        Self { id, extensions }
    }
}
impl LocalePreferences {
/// Constructs a new [`LocalePreferences`] struct with the defaults.
pub const fn default() -> Self {
Self {
language: Language::UNKNOWN,
script: None,
region: None,
variant: None,
subdivision: None,
ue_region: None,
}
}
/// Preference of Language
pub const fn language(&self) -> Language {
self.language
}
/// Preference of Region
pub const fn region(&self) -> Option<Region> {
self.region
}
/// Extends the preferences with the values from another set of preferences.
pub fn extend(&mut self, other: LocalePreferences) {
if !other.language.is_unknown() {
self.language = other.language;
}
if let Some(script) = other.script {
self.script = Some(script);
}
if let Some(region) = other.region {
self.region = Some(region);
}
if let Some(variant) = other.variant {
self.variant = Some(variant);
}
if let Some(sd) = other.subdivision {
self.subdivision = Some(sd);
}
if let Some(ue_region) = other.ue_region {
self.ue_region = Some(ue_region);
}
}
}

View File

@@ -0,0 +1,634 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This API provides necessary functionality for building user preferences structs.
//!
//! It includes the ability to merge information between the struct and a [`Locale`],
//! facilitating the resolution of attributes against default values.
//!
//! Preferences structs serve as a composable argument to `ICU4X` constructors, allowing
//! for ergonomic merging of information encoded in multiple sets of user inputs:
//! Locale, application preferences and operating system preferences.
//!
//! The crate is intended primarily to be used by component constructors to normalize the format
//! of ingesting preferences across all of `ICU4X`.
//!
//! # Preferences vs Options
//!
//! ICU4X introduces a separation between two classes of parameters that are used
//! to adjust the behavior of a component.
//!
//! `Preferences` represent the user-driven preferences on how the given user wants the internationalization
//! to behave. Those are items like language, script, calendar and numbering systems etc.
//!
//! `Options` represent the developer-driven adjustments that affect how given information is presented
//! based on the requirements of the application like available space or intended tone.
//!
//! # Options Division
//!
//! The `Options` themselves are also divided into options that are affecting data slicing, and ones that don't.
//! This is necessary to allow for DCE and FFI to produce minimal outputs avoiding loading unnecessary data that
//! is never to be used by a given component.
//! The result is that some option keys affect specialized constructors such as `try_new_short`, `try_new_long`, which
//! result in data provider loading only data necessary to format short or long values respectively.
//! For options that are not affecting data slicing, an `Options` struct is provided that the developer
//! can fill with selected key values, or use the defaults.
//!
//! # Preferences Merging
//!
//! In traditional internationalization APIs, the argument passed to constructors is a locale.
//! ICU4X changes this paradigm by accepting a `Preferences`, which can be extracted from a [`Locale`] and combined with
//! other `Preferences`s provided by the environment.
//!
//! This approach makes it easy for developers to write code that takes just a locale, as in other systems,
//! as well as handle more sophisticated cases where the application may receive, for example, a locale,
//! a set of internationalization preferences specified within the application,
//! and a third set extracted from the operating system's preferences.
//!
//! # ECMA-402 vs ICU4X
//!
//! The result of the two paradigm shifts presented above is that the way constructors work is different.
//!
//! ## ECMA-402
//! ```ignore
//! let locale = new Locale("en-US-u-hc-h12");
//! let options = {
//! hourCycle: "h24", // user preference
//! timeStyle: "long", // developer option
//! };
//!
//! let dtf = new DateTimeFormat(locale, options);
//! ```
//!
//! ## ICU4X
//! ```ignore
//! let loc = locale!("en-US-u-hc-h12");
//! let prefs = DateTimeFormatterPreferences {
//! hour_cycle: HourCycle::H23,
//! };
//! let options = DateTimeFormatterOptions {
//! time_style: TimeStyle::Long,
//! };
//!
//! let mut combined_prefs = DateTimeFormatterPreferences::from(loc);
//! combined_prefs.extend(prefs);
//!
//! let dtf = DateTimeFormatter::try_new(combined_prefs, options);
//! ```
//!
//! This architecture allows for flexible composition of user and developer settings
//! sourced from different locations in custom ways based on the needs of each deployment.
//!
//! Below are some examples of how the `Preferences` model can be used in different setups.
//!
//! # Examples
//!
//! ```
//! use icu::locale::preferences::{
//! define_preferences,
//! extensions::unicode::keywords::HourCycle,
//! };
//! use icu::locale::locale;
//!
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! define_preferences!(
//! /// Name of the preferences struct
//! [Copy]
//! ExampleComponentPreferences,
//! {
//! /// A preference relevant to the component
//! hour_cycle: HourCycle
//! }
//! );
//!
//! pub struct ExampleComponent {
//! data: MyData,
//! }
//!
//! impl ExampleComponent {
//! pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! let locale = get_data_locale_from_prefs(prefs);
//! let data = load_data(locale);
//!
//! Self { data }
//! }
//! }
//! ```
//!
//! Now we can use that component in several different ways:
//!
//! ## Scenario 1: Use Locale as the only input
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US-u-hc-h23");
//! let tf = ExampleComponent::new(loc.into());
//! ```
//!
//! ## Scenario 2: Compose Preferences and Locale
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US-u-hc-h23");
//! let app_prefs = ExampleComponentPreferences {
//! hour_cycle: Some(HourCycle::H12),
//! ..Default::default()
//! };
//!
//! let mut combined_prefs = ExampleComponentPreferences::from(loc);
//! combined_prefs.extend(app_prefs);
//!
//! // HourCycle is set from the prefs bag and overrides the value from the locale
//! assert_eq!(combined_prefs.hour_cycle, Some(HourCycle::H12));
//!
//! let tf = ExampleComponent::new(combined_prefs);
//! ```
//!
//! ## Scenario 3: Merge Preferences from Locale, OS, and Application
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US");
//!
//! // Simulate OS preferences
//! let os_prefs = ExampleComponentPreferences {
//! hour_cycle: Some(HourCycle::H23),
//! ..Default::default()
//! };
//!
//! // Application does not specify hour_cycle
//! let app_prefs = ExampleComponentPreferences {
//! hour_cycle: None,
//! ..Default::default()
//! };
//!
//! let mut combined_prefs = ExampleComponentPreferences::from(loc);
//! combined_prefs.extend(os_prefs);
//! combined_prefs.extend(app_prefs);
//!
//! // HourCycle is set from the OS preferences since the application didn't specify it
//! assert_eq!(combined_prefs.hour_cycle, Some(HourCycle::H23));
//!
//! let tf = ExampleComponent::new(combined_prefs);
//! ```
//!
//! ## Scenario 4: Neither Application nor OS specify the preference
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US-u-hc-h23");
//!
//! // Simulate OS preferences
//! let os_prefs = ExampleComponentPreferences::default(); // OS does not specify hour_cycle
//! let app_prefs = ExampleComponentPreferences::default(); // Application does not specify hour_cycle
//!
//! let mut combined_prefs = ExampleComponentPreferences::from(loc);
//! combined_prefs.extend(os_prefs);
//! combined_prefs.extend(app_prefs);
//!
//! // HourCycle is taken from the locale
//! assert_eq!(combined_prefs.hour_cycle, Some(HourCycle::H23));
//!
//! let tf = ExampleComponent::new(combined_prefs);
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
//! [`Locale`]: crate::Locale
pub mod extensions;
mod locale;
pub use locale::*;
/// A low-level trait implemented on each preference exposed in component preferences.
///
/// [`PreferenceKey`] has to be implemented on
/// preferences that are to be included in Formatter preferences.
/// The trait may be implemented to indicate that the given preference has
/// a unicode key corresponding to it or be a custom one.
///
/// `ICU4X` provides an implementation of [`PreferenceKey`] for all
/// Unicode Extension Keys. The only external use of this trait is to implement
/// it on custom preferences that are to be included in a component preferences bag.
///
/// The example below showcases manual generation of an `em` (emoji) unicode extension key
/// and a custom struct, to show the difference in their behavior. For all practical purposes,
/// the [`EmojiPresentationStyle`](crate::preferences::extensions::unicode::keywords::EmojiPresentationStyle) preference exposed by this crate should be used.
///
/// # Examples
/// ```
/// use icu::locale::{
/// extensions::unicode::{key, Key, value, Value},
/// preferences::{
/// define_preferences, PreferenceKey,
/// extensions::unicode::errors::PreferencesParseError,
/// },
/// };
///
/// #[non_exhaustive]
/// #[derive(Debug, Clone, Eq, PartialEq, Copy, Hash, Default)]
/// pub enum EmojiPresentationStyle {
/// Emoji,
/// Text,
/// #[default]
/// Default,
/// }
///
/// impl PreferenceKey for EmojiPresentationStyle {
/// fn unicode_extension_key() -> Option<Key> {
/// Some(key!("em"))
/// }
///
/// fn try_from_key_value(
/// key: &Key,
/// value: &Value,
/// ) -> Result<Option<Self>, PreferencesParseError> {
/// if Self::unicode_extension_key() == Some(*key) {
/// let subtag = value.as_single_subtag()
/// .ok_or(PreferencesParseError::InvalidKeywordValue)?;
/// match subtag.as_str() {
/// "emoji" => Ok(Some(Self::Emoji)),
/// "text" => Ok(Some(Self::Text)),
/// "default" => Ok(Some(Self::Default)),
/// _ => Err(PreferencesParseError::InvalidKeywordValue)
/// }
/// } else {
/// Ok(None)
/// }
/// }
///
/// fn unicode_extension_value(&self) -> Option<Value> {
/// Some(match self {
/// EmojiPresentationStyle::Emoji => value!("emoji"),
/// EmojiPresentationStyle::Text => value!("text"),
/// EmojiPresentationStyle::Default => value!("default"),
/// })
/// }
/// }
///
/// #[non_exhaustive]
/// #[derive(Debug, Clone, Eq, PartialEq, Hash)]
/// pub struct CustomFormat {
/// value: String
/// }
///
/// impl PreferenceKey for CustomFormat {}
///
/// define_preferences!(
/// MyFormatterPreferences,
/// {
/// emoji: EmojiPresentationStyle,
/// custom: CustomFormat
/// }
/// );
/// ```
/// [`ICU4X`]: ../icu/index.html
pub trait PreferenceKey: Sized {
/// Optional constructor of the given preference. It takes the
/// unicode extension key and, if the key matches, attempts to construct
/// the preference based on the given value.
/// If the value is not a valid value for the given key, the constructor returns an error.
///
/// The default implementation ignores both arguments and returns `Ok(None)`.
fn try_from_key_value(
_key: &crate::extensions::unicode::Key,
_value: &crate::extensions::unicode::Value,
) -> Result<Option<Self>, crate::preferences::extensions::unicode::errors::PreferencesParseError>
{
Ok(None)
}
/// Retrieve unicode extension key corresponding to a given preference.
///
/// The default implementation returns `None`.
fn unicode_extension_key() -> Option<crate::extensions::unicode::Key> {
None
}
/// Retrieve unicode extension value corresponding to the given instance of the preference.
///
/// The default implementation returns `None`.
fn unicode_extension_value(&self) -> Option<crate::extensions::unicode::Value> {
None
}
}
/// A macro to facilitate generation of preferences struct.
///
///
/// The generated preferences struct provides methods for merging and converting between [`Locale`] and
/// the preference bag. See [`preferences`](crate::preferences) for use cases.
///
/// In the example below, the input argument is the generated preferences struct which
/// can be auto-converted from a Locale, or combined from a Locale and Preferences Bag.
///
/// # Examples
/// ```
/// use icu::locale::{
/// preferences::{
/// define_preferences,
/// extensions::unicode::keywords::HourCycle
/// },
/// locale,
/// };
///
/// define_preferences!(
/// [Copy]
/// NoCalendarFormatterPreferences,
/// {
/// hour_cycle: HourCycle
/// }
/// );
///
/// struct NoCalendarFormatter {}
///
/// impl NoCalendarFormatter {
/// pub fn try_new(prefs: NoCalendarFormatterPreferences) -> Result<Self, ()> {
/// // load data and set struct fields based on the prefs input
/// Ok(Self {})
/// }
/// }
///
/// let loc = locale!("en-US");
///
/// let tf = NoCalendarFormatter::try_new(loc.into());
/// ```
///
/// [`Locale`]: crate::Locale
#[macro_export]
#[doc(hidden)]
macro_rules! __define_preferences {
// Input: optional attributes, an optional bracketed extra derive, the struct
// name, and a braced, comma-separated list of `field: PreferenceType` pairs
// (each optionally preceded by attributes such as doc comments).
(
$(#[$doc:meta])*
$([$derive_attrs:ty])?
$name:ident,
{
$(
$(#[$key_doc:meta])*
$key:ident: $pref:ty
),*
}
) => (
$(#[$doc])*
#[derive(Default, Debug, Clone, PartialEq, Eq, Hash)]
$(#[derive($derive_attrs)])?
#[non_exhaustive]
pub struct $name {
/// Locale Preferences for the Preferences structure.
pub locale_preferences: $crate::preferences::LocalePreferences,
// Every declared preference is stored as an `Option`, `None` meaning "not set".
$(
$(#[$key_doc])*
pub $key: Option<$pref>,
)*
}
impl From<$crate::Locale> for $name {
fn from(loc: $crate::Locale) -> Self {
$name::from(&loc)
}
}
impl From<&$crate::Locale> for $name {
fn from(loc: &$crate::Locale) -> Self {
// Lenient conversion: `from_locale_strict` returns the constructed
// struct in both the `Ok` and the `Err` case, so invalid keyword
// values are ignored here instead of being reported.
$name::from_locale_strict(loc).unwrap_or_else(|e| e)
}
}
impl From<$crate::LanguageIdentifier> for $name {
fn from(lid: $crate::LanguageIdentifier) -> Self {
$name::from(&lid)
}
}
impl From<&$crate::LanguageIdentifier> for $name {
fn from(lid: &$crate::LanguageIdentifier) -> Self {
// Only the locale preferences are filled in; every keyword
// preference starts out unset.
Self {
locale_preferences: lid.into(),
$(
$key: None,
)*
}
}
}
// impl From<$name> for $crate::Locale {
// fn from(other: $name) -> Self {
// use $crate::preferences::PreferenceKey;
// let mut result = Self::from(other.locale_preferences);
// $(
// if let Some(value) = other.$key {
// if let Some(ue) = <$pref>::unicode_extension_key() {
// let val = value.unicode_extension_value().unwrap();
// result.extensions.unicode.keywords.set(ue, val);
// }
// }
// )*
// result
// }
// }
impl $name {
/// Extends the preferences with the values from another set of preferences.
pub fn extend(&mut self, other: $name) {
self.locale_preferences.extend(other.locale_preferences);
// A preference of `other` overrides ours only when it is set.
$(
if let Some(value) = other.$key {
self.$key = Some(value);
}
)*
}
#[doc = concat!("Construct a `", stringify!($name), "` from a `Locale`")]
///
/// Returns `Err` if any of the preference values are invalid.
pub fn from_locale_strict(loc: &$crate::Locale) -> Result<Self, Self> {
use $crate::preferences::PreferenceKey;
let mut is_err = false;
// One local per declared preference, named after the field.
$(
let mut $key = None;
)*
for (k, v) in loc.extensions.unicode.keywords.iter() {
// Offer the keyword to each preference type in declaration order.
$(
match <$pref>::try_from_key_value(k, v) {
Ok(Some(k)) => {
$key = Some(k);
// This keyword has been consumed; move on to the next one.
continue;
}
Ok(None) => {}
Err(_) => {
// Remember the failure but keep going, so that the
// remaining keywords are still processed.
is_err = true
}
}
)*
}
let r = Self {
locale_preferences: loc.into(),
$(
$key,
)*
};
// The struct is returned either way; `Err` only signals that at least
// one keyword value failed to parse.
if is_err {
Err(r)
} else {
Ok(r)
}
}
}
)
}
/// Generates `impl From<&$name1> for $name2` between two preferences structs.
///
/// The generated conversion starts from `$name2::default()` and copies
/// `locale_preferences` from the source. With the optional braced list of
/// field names, each listed field is copied as well; the source and target
/// fields must share the same name.
#[macro_export]
#[doc(hidden)]
macro_rules! __prefs_convert {
// Form 1: copy the locale preferences only.
(
$name1:ident,
$name2:ident
) => {
impl From<&$name1> for $name2 {
fn from(other: &$name1) -> Self {
let mut result = Self::default();
result.locale_preferences = other.locale_preferences;
result
}
}
};
// Form 2: copy the locale preferences and the listed fields.
(
$name1:ident,
$name2:ident,
{
$(
$key:ident
),*
}
) => {
impl From<&$name1> for $name2 {
fn from(other: &$name1) -> Self {
let mut result = Self::default();
result.locale_preferences = other.locale_preferences;
$(
result.$key = other.$key;
)*
result
}
}
};
}
#[doc(inline)]
pub use __define_preferences as define_preferences;
#[doc(inline)]
pub use __prefs_convert as prefs_convert;

197
vendor/icu_locale_core/src/serde.rs vendored Normal file
View File

@@ -0,0 +1,197 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::{LanguageIdentifier, Locale};
use core::{fmt::Display, marker::PhantomData, str::FromStr};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use writeable::Writeable;
impl Serialize for LanguageIdentifier {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(&self.write_to_string())
}
}
impl Serialize for Locale {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(&self.write_to_string())
}
}
struct ParseVisitor<T>(PhantomData<T>);
impl<T> serde::de::Visitor<'_> for ParseVisitor<T>
where
T: FromStr,
<T as FromStr>::Err: Display,
{
type Value = T;
fn expecting(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(formatter, "a valid Unicode Language or Locale Identifier")
}
fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
s.parse::<T>().map_err(serde::de::Error::custom)
}
}
impl<'de> Deserialize<'de> for LanguageIdentifier {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_str(ParseVisitor(PhantomData))
}
}
impl<'de> Deserialize<'de> for Locale {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_str(ParseVisitor(PhantomData))
}
}
// JSON round-trip test. For each type it checks serialization to a JSON
// string, deserialization from `&str` and from a reader, and that the
// malformed input "2Xs" is rejected.
#[test]
fn json() {
use crate::subtags::{Language, Region, Script};
use crate::{langid, locale};
// LanguageIdentifier
assert_eq!(
serde_json::to_string(&langid!("en-US")).unwrap(),
r#""en-US""#
);
assert_eq!(
serde_json::from_str::<LanguageIdentifier>(r#""en-US""#).unwrap(),
langid!("en-US")
);
assert_eq!(
serde_json::from_reader::<_, LanguageIdentifier>(&br#""en-US""#[..]).unwrap(),
langid!("en-US")
);
assert!(serde_json::from_str::<LanguageIdentifier>(r#""2Xs""#).is_err());
// Locale (including a unicode extension)
assert_eq!(
serde_json::to_string(&locale!("en-US-u-hc-h12")).unwrap(),
r#""en-US-u-hc-h12""#
);
assert_eq!(
serde_json::from_str::<Locale>(r#""en-US-u-hc-h12""#).unwrap(),
locale!("en-US-u-hc-h12")
);
assert_eq!(
serde_json::from_reader::<_, Locale>(&br#""en-US-u-hc-h12""#[..]).unwrap(),
locale!("en-US-u-hc-h12")
);
assert!(serde_json::from_str::<Locale>(r#""2Xs""#).is_err());
// Language subtag
assert_eq!(
serde_json::to_string(&"fr".parse::<Language>().unwrap()).unwrap(),
r#""fr""#
);
assert_eq!(
serde_json::from_str::<Language>(r#""fr""#).unwrap(),
"fr".parse::<Language>().unwrap()
);
assert_eq!(
serde_json::from_reader::<_, Language>(&br#""fr""#[..]).unwrap(),
"fr".parse::<Language>().unwrap()
);
assert!(serde_json::from_str::<Language>(r#""2Xs""#).is_err());
// Script subtag
assert_eq!(
serde_json::to_string(&"Latn".parse::<Script>().unwrap()).unwrap(),
r#""Latn""#
);
assert_eq!(
serde_json::from_str::<Script>(r#""Latn""#).unwrap(),
"Latn".parse::<Script>().unwrap()
);
assert_eq!(
serde_json::from_reader::<_, Script>(&br#""Latn""#[..]).unwrap(),
"Latn".parse::<Script>().unwrap()
);
assert!(serde_json::from_str::<Script>(r#""2Xs""#).is_err());
// Region subtag
assert_eq!(
serde_json::to_string(&"US".parse::<Region>().unwrap()).unwrap(),
r#""US""#
);
assert_eq!(
serde_json::from_str::<Region>(r#""US""#).unwrap(),
"US".parse::<Region>().unwrap()
);
assert_eq!(
serde_json::from_reader::<_, Region>(&br#""US""#[..]).unwrap(),
"US".parse::<Region>().unwrap()
);
assert!(serde_json::from_str::<Region>(r#""2Xs""#).is_err());
}
// Postcard (binary format) round-trip test. For each type it checks the exact
// encoded bytes, decoding of those bytes, and that a malformed payload is
// rejected.
#[test]
fn postcard() {
use crate::subtags::{Language, Region, Script};
use crate::{langid, locale};
// LanguageIdentifier: the expected bytes are a length byte (0x05) followed
// by the 5 bytes of the string form.
assert_eq!(
postcard::to_stdvec(&langid!("en-US")).unwrap(),
b"\x05en-US"
);
assert_eq!(
postcard::from_bytes::<LanguageIdentifier>(b"\x05en-US").unwrap(),
langid!("en-US")
);
assert!(postcard::from_bytes::<LanguageIdentifier>(b"\x032Xs").is_err());
// Locale: same layout, length byte 0x0E followed by 14 bytes.
assert_eq!(
postcard::to_stdvec(&locale!("en-US-u-hc-h12")).unwrap(),
b"\x0Een-US-u-hc-h12"
);
assert_eq!(
postcard::from_bytes::<Locale>(b"\x0Een-US-u-hc-h12").unwrap(),
locale!("en-US-u-hc-h12")
);
assert!(postcard::from_bytes::<Locale>(b"\x032Xs").is_err());
// Language: the expected bytes are 3 bytes with no length prefix, padded
// with a NUL byte.
// NOTE(review): presumably the fixed-width encoding of the underlying
// subtag storage; that serializer is not visible in this file.
assert_eq!(
postcard::to_stdvec(&"fr".parse::<Language>().unwrap()).unwrap(),
b"fr\0"
);
assert_eq!(
postcard::from_bytes::<Language>(b"fr\0").unwrap(),
"fr".parse::<Language>().unwrap()
);
assert!(postcard::from_bytes::<Language>(b"2Xs").is_err());
// Script: 4 bytes, no length prefix.
assert_eq!(
postcard::to_stdvec(&"Latn".parse::<Script>().unwrap()).unwrap(),
b"Latn"
);
assert_eq!(
postcard::from_bytes::<Script>(b"Latn").unwrap(),
"Latn".parse::<Script>().unwrap()
);
assert!(postcard::from_bytes::<Script>(b"2Xss").is_err());
// Region: 3 bytes, no length prefix, padded with a NUL byte.
assert_eq!(
postcard::to_stdvec(&"US".parse::<Region>().unwrap()).unwrap(),
b"US\0"
);
assert_eq!(
postcard::from_bytes::<Region>(b"US\0").unwrap(),
"US".parse::<Region>().unwrap()
);
assert!(postcard::from_bytes::<Region>(b"2Xs").is_err());
}

View File

@@ -0,0 +1,200 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::ShortBoxSlice;
use super::ShortBoxSliceInner;
#[cfg(feature = "alloc")]
use super::ShortBoxSliceIntoIter;
use litemap::store::*;
impl<K, V> StoreConstEmpty<K, V> for ShortBoxSlice<(K, V)> {
    /// The empty store, constructible in `const` contexts.
    const EMPTY: Self = Self::new();
}
impl<K, V> StoreSlice<K, V> for ShortBoxSlice<(K, V)> {
    type Slice = [(K, V)];

    /// Returns the entries in `range`, or `None` if the range is out of bounds.
    #[inline]
    fn lm_get_range(&self, range: core::ops::Range<usize>) -> Option<&Self::Slice> {
        // View the collection as a plain slice and let the slice do the bounds checking.
        let entries: &[(K, V)] = self;
        entries.get(range)
    }
}
impl<K, V> Store<K, V> for ShortBoxSlice<(K, V)> {
#[inline]
fn lm_len(&self) -> usize {
self.len()
}
#[inline]
fn lm_is_empty(&self) -> bool {
use ShortBoxSliceInner::*;
matches!(self.0, ZeroOne(None))
}
#[inline]
fn lm_get(&self, index: usize) -> Option<(&K, &V)> {
self.get(index).map(|elt| (&elt.0, &elt.1))
}
#[inline]
fn lm_last(&self) -> Option<(&K, &V)> {
use ShortBoxSliceInner::*;
match self.0 {
ZeroOne(ref v) => v.as_ref(),
#[cfg(feature = "alloc")]
Multi(ref v) => v.last(),
#[cfg(not(feature = "alloc"))]
Two([_, ref v]) => Some(v),
}
.map(|elt| (&elt.0, &elt.1))
}
#[inline]
fn lm_binary_search_by<F>(&self, mut cmp: F) -> Result<usize, usize>
where
F: FnMut(&K) -> core::cmp::Ordering,
{
self.binary_search_by(|(k, _)| cmp(k))
}
}
#[cfg(feature = "alloc")]
impl<K: Ord, V> StoreFromIterable<K, V> for ShortBoxSlice<(K, V)> {
    /// Builds the store from `iter` by delegating to the `Vec` implementation
    /// and then converting the resulting vector into a [`ShortBoxSlice`].
    fn lm_sort_from_iter<I: IntoIterator<Item = (K, V)>>(iter: I) -> Self {
        let sorted = alloc::vec::Vec::<(K, V)>::lm_sort_from_iter(iter);
        Self::from(sorted)
    }
}
#[cfg(feature = "alloc")]
impl<K, V> StoreMut<K, V> for ShortBoxSlice<(K, V)> {
    /// Creates an empty store; the requested capacity is ignored.
    fn lm_with_capacity(_capacity: usize) -> Self {
        Self::new()
    }

    /// Does nothing: this store never reserves space ahead of time.
    fn lm_reserve(&mut self, _additional: usize) {}

    /// Borrows the key immutably and the value mutably at `index`, if in bounds.
    fn lm_get_mut(&mut self, index: usize) -> Option<(&K, &mut V)> {
        let entry = self.get_mut(index)?;
        Some((&entry.0, &mut entry.1))
    }

    /// Appends the entry `(key, value)` at the end of the store.
    fn lm_push(&mut self, key: K, value: V) {
        let entry = (key, value);
        self.push(entry);
    }

    /// Inserts the entry `(key, value)` at position `index`.
    fn lm_insert(&mut self, index: usize, key: K, value: V) {
        let entry = (key, value);
        self.insert(index, entry);
    }

    /// Removes and returns the entry at position `index`.
    fn lm_remove(&mut self, index: usize) -> (K, V) {
        ShortBoxSlice::remove(self, index)
    }

    /// Removes every entry from the store.
    fn lm_clear(&mut self) {
        ShortBoxSlice::clear(self);
    }
}
#[cfg(feature = "alloc")]
impl<K: Ord, V> StoreBulkMut<K, V> for ShortBoxSlice<(K, V)> {
    /// Keeps only the entries for which `predicate` returns `true`.
    fn lm_retain<F>(&mut self, mut predicate: F)
    where
        F: FnMut(&K, &V) -> bool,
    {
        self.retain(|(k, v)| predicate(k, v))
    }

    /// Merges the entries yielded by `other` into this store.
    ///
    /// Sorting and deduplication are delegated to the `Vec` implementation of
    /// `lm_extend`. The result is always stored in normalized form: zero or one
    /// entry is kept inline, and only two or more entries use the heap-backed
    /// variant.
    fn lm_extend<I>(&mut self, other: I)
    where
        I: IntoIterator<Item = (K, V)>,
    {
        let mut other = other.into_iter();
        // Use an Option to hold the first item of the map and move it to
        // items if there are more items. Meaning that if items is not
        // empty, first is None.
        let mut first = None;
        let mut items = alloc::vec::Vec::new();
        match core::mem::take(&mut self.0) {
            ShortBoxSliceInner::ZeroOne(zo) => {
                first = zo;
                // Attempt to avoid the items allocation by advancing the iterator
                // up to two times. If we eventually find a second item, we can
                // lm_extend the Vec with the first, next (second) and the rest
                // of the iterator.
                while let Some(next) = other.next() {
                    if let Some(first) = first.take() {
                        // lm_extend will take care of sorting and deduplicating
                        // first, next and the rest of the other iterator.
                        items.lm_extend([first, next].into_iter().chain(other));
                        break;
                    }
                    first = Some(next);
                }
            }
            ShortBoxSliceInner::Multi(existing_items) => {
                items.reserve_exact(existing_items.len() + other.size_hint().0);
                // We use a plain extend with existing items, which are already valid and
                // lm_extend will fold over rest of the iterator sorting and deduplicating as needed.
                items.extend(existing_items);
                items.lm_extend(other);
            }
        }
        if items.is_empty() {
            self.0 = ShortBoxSliceInner::ZeroOne(first);
        } else {
            debug_assert!(first.is_none());
            // Deduplication may have collapsed `items` down to a single entry
            // (e.g. the existing entry and the first new entry share a key).
            // Converting through `From<Vec<_>>` stores that case inline instead
            // of producing a one-element heap variant, which `single()` and the
            // derived equality would not treat as a single-element collection.
            *self = ShortBoxSlice::from(items);
        }
    }
}
impl<'a, K: 'a, V: 'a> StoreIterable<'a, K, V> for ShortBoxSlice<(K, V)> {
    type KeyValueIter =
        core::iter::Map<core::slice::Iter<'a, (K, V)>, for<'r> fn(&'r (K, V)) -> (&'r K, &'r V)>;

    /// Iterates over the entries in order, yielding borrowed key/value pairs.
    fn lm_iter(&'a self) -> Self::KeyValueIter {
        // The closure captures nothing, so it coerces to the `fn` pointer
        // named in `KeyValueIter`.
        self.iter().map(|entry| {
            let (key, value) = entry;
            (key, value)
        })
    }
}
// Empty impl: no items are overridden here, so any behavior comes from the
// trait's own definitions.
#[cfg(feature = "alloc")]
impl<K, V> StoreFromIterator<K, V> for ShortBoxSlice<(K, V)> {}
#[cfg(feature = "alloc")]
impl<'a, K: 'a, V: 'a> StoreIterableMut<'a, K, V> for ShortBoxSlice<(K, V)> {
    type KeyValueIterMut = core::iter::Map<
        core::slice::IterMut<'a, (K, V)>,
        for<'r> fn(&'r mut (K, V)) -> (&'r K, &'r mut V),
    >;

    /// Iterates over the entries in order, yielding each key by shared
    /// reference and each value by mutable reference.
    fn lm_iter_mut(&'a mut self) -> Self::KeyValueIterMut {
        // The closure captures nothing, so it coerces to the `fn` pointer
        // named in `KeyValueIterMut`.
        self.iter_mut().map(|entry| (&entry.0, &mut entry.1))
    }
}
#[cfg(feature = "alloc")]
impl<K, V> StoreIntoIterator<K, V> for ShortBoxSlice<(K, V)> {
    type KeyValueIntoIter = ShortBoxSliceIntoIter<(K, V)>;

    /// Consumes the store and yields its entries by value.
    fn lm_into_iter(self) -> Self::KeyValueIntoIter {
        IntoIterator::into_iter(self)
    }
    // leave lm_extend_end as default
    // leave lm_extend_start as default
}
#[test]
fn test_short_slice_impl() {
    // Run litemap's store checks against a ShortBoxSlice-backed store.
    type StoreUnderTest = ShortBoxSlice<(u32, u64)>;
    litemap::testing::check_store::<StoreUnderTest>();
}
#[test]
fn test_short_slice_impl_full() {
    // Run litemap's full store checks against a ShortBoxSlice-backed store.
    type StoreUnderTest = ShortBoxSlice<(u32, u64)>;
    litemap::testing::check_store_full::<StoreUnderTest>();
}

View File

@@ -0,0 +1,404 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module includes variable-length data types that are const-constructible for single
//! values and overflow to the heap.
//!
//! # Why?
//!
//! This module is far from the first stack-or-heap vector in the Rust ecosystem. It was created
//! with the following value proposition:
//!
//! 1. Enable safe const construction of stack collections.
//! 2. Avoid stack size penalties common with stack-or-heap collections.
//!
//! As of this writing, `heapless` and `tinyvec` don't support const construction except
//! for empty vectors, and `smallvec` supports it on unstable.
//!
//! Additionally, [`ShortBoxSlice`] has a smaller stack size than any of these:
//!
//! ```ignore
//! use core::mem::size_of;
//!
//! // NonZeroU64 has a niche that this module utilizes
//! use core::num::NonZeroU64;
//!
//! // ShortBoxSlice is the same size as `Box<[]>` for small or nichey values
//! assert_eq!(16, size_of::<shortvec::ShortBoxSlice::<NonZeroU64>>());
//!
//! // Note: SmallVec supports pushing and therefore has a capacity field
//! assert_eq!(24, size_of::<smallvec::SmallVec::<[NonZeroU64; 1]>>());
//!
//! // Note: heapless doesn't support spilling to the heap
//! assert_eq!(16, size_of::<heapless::Vec::<NonZeroU64, 1>>());
//!
//! // Note: TinyVec only supports types that implement `Default`
//! assert_eq!(24, size_of::<tinyvec::TinyVec::<[u64; 1]>>());
//! ```
//!
//! The module is `no_std` with `alloc`.
mod litemap;
#[cfg(feature = "alloc")]
use alloc::boxed::Box;
#[cfg(feature = "alloc")]
use alloc::vec;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
use core::ops::DerefMut;
/// A boxed slice that supports no-allocation, constant values if length 0 or 1.
///
/// This is the storage behind [`ShortBoxSlice`]. Which multi-element variant
/// exists depends on the `alloc` Cargo feature.
///
/// Using ZeroOne(Option<T>) saves 8 bytes in ShortBoxSlice via niche optimization.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum ShortBoxSliceInner<T> {
/// No element (`None`) or exactly one element (`Some`), stored inline.
ZeroOne(Option<T>),
/// Elements stored in a boxed slice. Only present with the `alloc` feature.
#[cfg(feature = "alloc")]
Multi(Box<[T]>),
/// Exactly two elements stored inline. Only present without the `alloc` feature.
#[cfg(not(feature = "alloc"))]
Two([T; 2]),
}
impl<T> Default for ShortBoxSliceInner<T> {
fn default() -> Self {
use ShortBoxSliceInner::*;
ZeroOne(None)
}
}
/// A boxed slice that supports no-allocation, constant values if length 0 or 1.
///
/// Supports mutation, but mutating a heap-backed collection converts the boxed
/// slice to a vector and back, which may reallocate.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct ShortBoxSlice<T>(ShortBoxSliceInner<T>);
impl<T> Default for ShortBoxSlice<T> {
fn default() -> Self {
Self(Default::default())
}
}
impl<T> ShortBoxSlice<T> {
    /// Creates a new, empty [`ShortBoxSlice`].
    #[inline]
    pub const fn new() -> Self {
        use ShortBoxSliceInner::*;
        Self(ZeroOne(None))
    }

    /// Creates a new [`ShortBoxSlice`] containing a single element.
    #[inline]
    pub const fn new_single(item: T) -> Self {
        use ShortBoxSliceInner::*;
        Self(ZeroOne(Some(item)))
    }

    /// Creates a new [`ShortBoxSlice`] containing exactly two elements, in the given order.
    ///
    /// With the `alloc` feature the pair is stored in a boxed slice; without it,
    /// the pair is stored inline.
    pub fn new_double(first: T, second: T) -> Self {
        use ShortBoxSliceInner::*;
        #[cfg(feature = "alloc")]
        return Self(Multi(vec![first, second].into_boxed_slice()));
        #[cfg(not(feature = "alloc"))]
        return Self(Two([first, second]));
    }

    /// Pushes an element onto this [`ShortBoxSlice`].
    ///
    /// Reallocs if more than 1 item is already in the collection.
    #[cfg(feature = "alloc")]
    pub fn push(&mut self, item: T) {
        use ShortBoxSliceInner::*;
        self.0 = match core::mem::replace(&mut self.0, ZeroOne(None)) {
            ZeroOne(None) => ZeroOne(Some(item)),
            ZeroOne(Some(prev_item)) => Multi(vec![prev_item, item].into_boxed_slice()),
            Multi(items) => {
                let mut items = items.into_vec();
                items.push(item);
                Multi(items.into_boxed_slice())
            }
        };
    }

    /// Gets a single element from the [`ShortBoxSlice`].
    ///
    /// Returns `None` if empty or more than one element.
    #[inline]
    pub const fn single(&self) -> Option<&T> {
        use ShortBoxSliceInner::*;
        match self.0 {
            ZeroOne(Some(ref v)) => Some(v),
            _ => None,
        }
    }

    /// Destruct into a single element of the [`ShortBoxSlice`].
    ///
    /// Returns `None` if empty or more than one element.
    pub fn into_single(self) -> Option<T> {
        use ShortBoxSliceInner::*;
        match self.0 {
            ZeroOne(Some(v)) => Some(v),
            _ => None,
        }
    }

    /// Returns the number of elements in the collection.
    #[inline]
    pub fn len(&self) -> usize {
        use ShortBoxSliceInner::*;
        match self.0 {
            ZeroOne(None) => 0,
            ZeroOne(_) => 1,
            #[cfg(feature = "alloc")]
            Multi(ref v) => v.len(),
            #[cfg(not(feature = "alloc"))]
            Two(_) => 2,
        }
    }

    /// Returns whether the collection is empty.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        use ShortBoxSliceInner::*;
        matches!(self.0, ZeroOne(None))
    }

    /// Inserts an element at the specified index into the collection.
    ///
    /// Reallocs if more than 1 item is already in the collection.
    ///
    /// # Panics
    ///
    /// Panics if `index` is greater than the current length.
    #[cfg(feature = "alloc")]
    pub fn insert(&mut self, index: usize, elt: T) {
        use ShortBoxSliceInner::*;
        assert!(
            index <= self.len(),
            "insertion index (is {}) should be <= len (is {})",
            index,
            self.len()
        );
        self.0 = match core::mem::replace(&mut self.0, ZeroOne(None)) {
            ZeroOne(None) => ZeroOne(Some(elt)),
            ZeroOne(Some(item)) => {
                // The assertion above limits `index` to 0 or 1 here.
                let items = if index == 0 {
                    vec![elt, item].into_boxed_slice()
                } else {
                    vec![item, elt].into_boxed_slice()
                };
                Multi(items)
            }
            Multi(items) => {
                let mut items = items.into_vec();
                items.insert(index, elt);
                Multi(items.into_boxed_slice())
            }
        }
    }

    /// Removes the element at the specified index from the collection.
    ///
    /// Reallocs if more than 2 items are in the collection.
    ///
    /// # Panics
    ///
    /// Panics if `index` is not less than the current length.
    pub fn remove(&mut self, index: usize) -> T {
        use ShortBoxSliceInner::*;
        assert!(
            index < self.len(),
            "removal index (is {}) should be < len (is {})",
            index,
            self.len()
        );
        let (replaced, removed_item) = match core::mem::replace(&mut self.0, ZeroOne(None)) {
            // The assertion above rejects every index for an empty collection.
            ZeroOne(None) => unreachable!(),
            ZeroOne(Some(v)) => (ZeroOne(None), v),
            #[cfg(feature = "alloc")]
            Multi(v) => {
                let mut v = v.into_vec();
                let removed_item = v.remove(index);
                // `From<Vec<T>>` picks the storage for whatever is left: inline
                // for zero or one element, boxed slice otherwise.
                (Self::from(v).0, removed_item)
            }
            #[cfg(not(feature = "alloc"))]
            Two([first, second]) => {
                // Honor `index`: hand back the requested element and keep the other one.
                if index == 0 {
                    (ZeroOne(Some(second)), first)
                } else {
                    (ZeroOne(Some(first)), second)
                }
            }
        };
        self.0 = replaced;
        removed_item
    }

    /// Removes all elements from the collection.
    #[inline]
    pub fn clear(&mut self) {
        use ShortBoxSliceInner::*;
        // Assigning drops the previous contents.
        self.0 = ZeroOne(None);
    }

    /// Retains only the elements specified by the predicate.
    #[allow(dead_code)]
    pub fn retain<F>(&mut self, mut f: F)
    where
        F: FnMut(&T) -> bool,
    {
        use ShortBoxSliceInner::*;
        // `take` leaves `self` empty; each arm below stores the surviving elements back.
        match core::mem::take(&mut self.0) {
            ZeroOne(Some(one)) if f(&one) => self.0 = ZeroOne(Some(one)),
            ZeroOne(_) => self.0 = ZeroOne(None),
            #[cfg(feature = "alloc")]
            Multi(slice) => {
                let mut vec = slice.into_vec();
                vec.retain(f);
                *self = ShortBoxSlice::from(vec)
            }
            #[cfg(not(feature = "alloc"))]
            Two([first, second]) => {
                *self = match (Some(first).filter(&mut f), Some(second).filter(&mut f)) {
                    (None, None) => ShortBoxSlice::new(),
                    (None, Some(x)) | (Some(x), None) => ShortBoxSlice::new_single(x),
                    (Some(f), Some(s)) => ShortBoxSlice::new_double(f, s),
                }
            }
        };
    }
}
impl<T> Deref for ShortBoxSlice<T> {
    type Target = [T];

    /// Views the collection as a slice, whichever storage variant is in use.
    fn deref(&self) -> &Self::Target {
        use ShortBoxSliceInner::*;
        match &self.0 {
            ZeroOne(None) => &[],
            // A lone inline element is exposed as a one-element slice.
            ZeroOne(Some(item)) => core::slice::from_ref(item),
            #[cfg(feature = "alloc")]
            Multi(items) => items,
            #[cfg(not(feature = "alloc"))]
            Two(pair) => pair,
        }
    }
}
impl<T> DerefMut for ShortBoxSlice<T> {
    /// Views the collection as a mutable slice, whichever storage variant is in use.
    ///
    /// Only the elements can be changed through the returned slice, not the length.
    fn deref_mut(&mut self) -> &mut Self::Target {
        use ShortBoxSliceInner::*;
        match &mut self.0 {
            ZeroOne(None) => &mut [],
            // A lone inline element is exposed as a one-element slice.
            ZeroOne(Some(item)) => core::slice::from_mut(item),
            #[cfg(feature = "alloc")]
            Multi(items) => items,
            #[cfg(not(feature = "alloc"))]
            Two(pair) => pair,
        }
    }
}
#[cfg(feature = "alloc")]
impl<T> From<Vec<T>> for ShortBoxSlice<T> {
    /// Converts a vector, storing zero or one element inline and anything
    /// longer as a boxed slice.
    fn from(v: Vec<T>) -> Self {
        use ShortBoxSliceInner::*;
        if v.len() > 1 {
            return Self(Multi(v.into_boxed_slice()));
        }
        // At most one element is left, which is exactly what `ZeroOne` holds:
        // `next()` yields `None` for an empty vector and `Some` otherwise.
        Self(ZeroOne(v.into_iter().next()))
    }
}
#[cfg(feature = "alloc")]
impl<T> FromIterator<T> for ShortBoxSlice<T> {
    /// Collects an iterator, allocating only when it yields two or more elements.
    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
        use ShortBoxSliceInner::*;
        let mut rest = iter.into_iter();
        // Always pull two items up front; the heap is needed only if both exist.
        let first = rest.next();
        let second = rest.next();
        if first.is_none() || second.is_none() {
            return Self(ZeroOne(first));
        }
        // Size hint behaviour same as `Vec::extend` + 2
        let mut collected = Vec::with_capacity(rest.size_hint().0.saturating_add(3));
        collected.extend(first);
        collected.extend(second);
        collected.extend(rest);
        Self(Multi(collected.into_boxed_slice()))
    }
}
/// An iterator that yields elements from a [`ShortBoxSlice`].
///
/// Created by the [`IntoIterator`] implementation of [`ShortBoxSlice`]; the
/// elements are yielded by value.
#[derive(Debug)]
pub struct ShortBoxSliceIntoIter<T>(ShortBoxSliceIntoIterInner<T>);
/// Iteration state behind [`ShortBoxSliceIntoIter`], with one variant per
/// storage layout of `ShortBoxSliceInner`.
#[derive(Debug)]
pub(crate) enum ShortBoxSliceIntoIterInner<T> {
/// The single element still to be yielded, or `None` if nothing is left.
ZeroOne(Option<T>),
/// Iterator over heap-stored elements. Only present with the `alloc` feature.
#[cfg(feature = "alloc")]
Multi(alloc::vec::IntoIter<T>),
/// Iterator over two inline elements. Only present without the `alloc` feature.
#[cfg(not(feature = "alloc"))]
Two(core::array::IntoIter<T, 2>),
}
impl<T> Iterator for ShortBoxSliceIntoIter<T> {
    type Item = T;

    /// Yields the next element by value, or `None` once the iterator is exhausted.
    fn next(&mut self) -> Option<T> {
        use ShortBoxSliceIntoIterInner::*;
        match self.0 {
            // Taking the element leaves `None` behind, so it is yielded only once.
            ZeroOne(ref mut slot) => slot.take(),
            #[cfg(feature = "alloc")]
            Multi(ref mut remaining) => remaining.next(),
            #[cfg(not(feature = "alloc"))]
            Two(ref mut remaining) => remaining.next(),
        }
    }
}
impl<T> IntoIterator for ShortBoxSlice<T> {
    type Item = T;
    type IntoIter = ShortBoxSliceIntoIter<T>;

    /// Consumes the collection and returns an iterator over its elements by value.
    fn into_iter(self) -> Self::IntoIter {
        // Map each storage layout onto the matching iterator state.
        let state = match self.0 {
            ShortBoxSliceInner::ZeroOne(item) => ShortBoxSliceIntoIterInner::ZeroOne(item),
            // TODO: Use a boxed slice IntoIter impl when available:
            // <https://github.com/rust-lang/rust/issues/59878>
            #[cfg(feature = "alloc")]
            ShortBoxSliceInner::Multi(items) => {
                ShortBoxSliceIntoIterInner::Multi(items.into_vec().into_iter())
            }
            #[cfg(not(feature = "alloc"))]
            ShortBoxSliceInner::Two(pair) => ShortBoxSliceIntoIterInner::Two(pair.into_iter()),
        };
        ShortBoxSliceIntoIter(state)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A single-element collection can be built in a `const` and read back.
    #[test]
    #[expect(clippy::get_first)]
    fn test_new_single_const() {
        const SINGLE: ShortBoxSlice<i32> = ShortBoxSlice::new_single(42);
        assert_eq!(SINGLE.len(), 1);
        assert_eq!(SINGLE.get(0), Some(&42));
    }

    /// `single()` reports an element only while exactly one is stored.
    #[test]
    #[expect(clippy::redundant_pattern_matching)]
    fn test_get_single() {
        let mut slice = ShortBoxSlice::new();
        assert!(matches!(slice.single(), None));

        slice.push(100);
        assert!(matches!(slice.single(), Some(_)));

        slice.push(200);
        assert!(matches!(slice.single(), None));
    }
}

View File

@@ -0,0 +1,59 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A language subtag (examples: `"en"`, `"csb"`, `"zh"`, `"und"`, etc.)
///
/// [`Language`] represents a Unicode base language code conformant to the
/// [`unicode_language_id`] field of the Language and Locale Identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Language;
///
/// let language: Language =
/// "en".parse().expect("Failed to parse a language subtag.");
/// ```
///
/// If the [`Language`] has no value assigned, it serializes to the string `"und"`, which
/// can then be parsed back to an empty [`Language`] field.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Language;
///
/// assert_eq!(Language::UNKNOWN.as_str(), "und");
/// ```
///
/// `Notice`: ICU4X uses a narrow form of language subtag of 2-3 characters.
/// The specification allows language subtag to optionally also be 5-8 characters
/// but that form has not been used and ICU4X does not support it right now.
///
/// [`unicode_language_id`]: https://unicode.org/reports/tr35/#unicode_language_id
Language,
subtags,
language,
subtags_language,
// Accepted subtag length, matching the 2-3 character form described above.
2..=3,
s,
// NOTE(review): the three expressions below look like the validity check, the
// normalization, and the already-normalized check — confirm against the macro definition.
s.is_ascii_alphabetic(),
s.to_ascii_lowercase(),
s.is_ascii_alphabetic_lowercase(),
InvalidLanguage,
// NOTE(review): presumably sample inputs that must parse, followed by samples that must not.
["en", "foo"],
["419", "german", "en1"],
);
impl Language {
    /// The unknown language, written `"und"`.
    pub const UNKNOWN: Self = language!("und");

    /// Returns `true` if this [`Language`] is [`Language::UNKNOWN`].
    #[inline]
    pub const fn is_unknown(self) -> bool {
        match self {
            Self::UNKNOWN => true,
            _ => false,
        }
    }
}

View File

@@ -0,0 +1,163 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Language Identifier and Locale contains a set of subtags
//! which represent different fields of the structure.
//!
//! * [`Language`] is the only mandatory field, which when empty,
//! takes the value `und`.
//! * [`Script`] is an optional field representing the written script used by the locale.
//! * [`Region`] is the region used by the locale.
//! * [`Variants`] is a list of optional [`Variant`] subtags containing information about the
//! variant adjustments used by the locale.
//!
//! Subtags can be used in isolation, and all basic operations such as parsing, syntax normalization
//! and serialization are supported on each individual subtag, but most commonly
//! they are used to construct a [`LanguageIdentifier`] instance.
//!
//! [`Variants`] is a special structure which contains a list of [`Variant`] subtags.
//! It is wrapped around to allow for sorting and deduplication of variants, which
//! is one of the required steps of language identifier and locale syntax normalization.
//!
//! # Examples
//!
//! ```
//! use icu::locale::subtags::{Language, Region, Script, Variant};
//!
//! let language: Language =
//! "en".parse().expect("Failed to parse a language subtag.");
//! let script: Script =
//! "arab".parse().expect("Failed to parse a script subtag.");
//! let region: Region =
//! "cn".parse().expect("Failed to parse a region subtag.");
//! let variant: Variant =
//! "MacOS".parse().expect("Failed to parse a variant subtag.");
//!
//! assert_eq!(language.as_str(), "en");
//! assert_eq!(script.as_str(), "Arab");
//! assert_eq!(region.as_str(), "CN");
//! assert_eq!(variant.as_str(), "macos");
//! ```
//!
//! `Notice`: The subtags are normalized on parsing. That means
//! that all operations work on a normalized version of the subtag
//! and serialization is very cheap.
//!
//! [`LanguageIdentifier`]: super::LanguageIdentifier
mod language;
mod region;
mod script;
mod variant;
mod variants;
#[doc(inline)]
pub use language::{language, Language};
#[doc(inline)]
pub use region::{region, Region};
#[doc(inline)]
pub use script::{script, Script};
#[doc(inline)]
pub use variant::{variant, Variant};
pub use variants::Variants;
impl_tinystr_subtag!(
/// A generic subtag.
///
/// The subtag has to be an ASCII alphanumerical string no shorter than
/// two characters and no longer than eight. It is lowercased when parsed,
/// as the example below shows.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Subtag;
///
/// let subtag1: Subtag = "Foo".parse()
/// .expect("Failed to parse a Subtag.");
///
/// assert_eq!(subtag1.as_str(), "foo");
/// ```
Subtag,
subtags,
subtag,
subtags_subtag,
// Accepted subtag length: two to eight characters, as documented above.
2..=8,
s,
s.is_ascii_alphanumeric(),
s.to_ascii_lowercase(),
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
InvalidSubtag,
// NOTE(review): presumably sample inputs that must parse, followed by samples that must not.
["foo12"],
["f", "toolooong"],
);
#[expect(clippy::len_without_is_empty)]
impl Subtag {
    /// Reports whether `v` has a length a subtag may have (2 through 8 bytes).
    ///
    /// Only the length is checked; the bytes themselves are not inspected.
    #[allow(dead_code)]
    pub(crate) const fn valid_key(v: &[u8]) -> bool {
        matches!(v.len(), 2..=8)
    }

    /// Returns the length of `self`.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::subtags::subtag;
    /// let s = subtag!("foo");
    /// assert_eq!(s.len(), 3);
    /// ```
    pub fn len(&self) -> usize {
        let stored = &self.0;
        stored.len()
    }

    /// Wraps `input` as a [`Subtag`] without running any validation on it.
    #[doc(hidden)]
    pub fn from_tinystr_unvalidated(input: tinystr::TinyAsciiStr<8>) -> Self {
        Self(input)
    }

    /// Returns a copy of the underlying string storage.
    #[doc(hidden)]
    pub fn as_tinystr(&self) -> tinystr::TinyAsciiStr<8> {
        self.0
    }

    /// Returns this subtag with its contents converted to ASCII lowercase.
    #[allow(dead_code)]
    pub(crate) fn to_ascii_lowercase(self) -> Self {
        let lowered = self.0.to_ascii_lowercase();
        Self(lowered)
    }
}
impl<const N: usize> TryFrom<tinystr::TinyAsciiStr<N>> for Subtag {
    type Error = crate::parser::errors::ParseError;

    /// Parses the string held by `value` as a [`Subtag`], using the same
    /// routine as parsing from a `&str`.
    fn try_from(value: tinystr::TinyAsciiStr<N>) -> Result<Self, Self::Error> {
        let text: &str = &value;
        Self::try_from_str(text)
    }
}
impl PartialEq<str> for Subtag {
fn eq(&self, other: &str) -> bool {
self.0 == other
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use tinystr::tinystr;

    /// The `subtag!` macro produces a subtag that reads back unchanged.
    #[test]
    fn test_subtag() {
        let parsed = subtag!("foo");
        assert_eq!(parsed.as_str(), "foo");
    }

    /// Conversion from a tiny string accepts a valid length and rejects a too-short one.
    #[test]
    fn test_subtag_from_tinystr() {
        let three_chars = Subtag::try_from(tinystr!(3, "foo"));
        assert!(three_chars.is_ok());

        let one_char = Subtag::try_from(tinystr!(1, "f"));
        assert!(one_char.is_err());
    }
}

View File

@@ -0,0 +1,60 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A region subtag (examples: `"US"`, `"CN"`, `"AR"` etc.)
///
/// [`Region`] represents a Unicode region code conformant to the
/// [`unicode_region_id`] field of the Language and Locale Identifier.
///
/// A region is either two ASCII letters or three ASCII digits.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Region;
///
/// let region: Region =
/// "DE".parse().expect("Failed to parse a region subtag.");
/// ```
///
/// [`unicode_region_id`]: https://unicode.org/reports/tr35/#unicode_region_id
Region,
subtags,
region,
subtags_region,
2..=3,
s,
// Two-character regions must be alphabetic; three-character regions must be numeric.
if s.len() == 2 {
s.is_ascii_alphabetic()
} else {
s.is_ascii_numeric()
},
// Only the two-letter form is uppercased; the numeric form is left as is.
if s.len() == 2 {
s.to_ascii_uppercase()
} else {
s
},
if s.len() == 2 {
s.is_ascii_alphabetic_uppercase()
} else {
s.is_ascii_numeric()
},
InvalidSubtag,
// NOTE(review): presumably sample inputs that must parse, followed by samples that must not.
["FR", "123"],
["12", "FRA", "b2"],
);
impl Region {
    /// Returns true if the Region has an alphabetic code.
    ///
    /// The check is purely on length: a two-character region counts as alphabetic.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::subtags::region;
    ///
    /// assert!(region!("us").is_alphabetic());
    /// ```
    pub fn is_alphabetic(&self) -> bool {
        let code_len = self.0.len();
        code_len == 2
    }
}

View File

@@ -0,0 +1,41 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::subtags::Subtag;
impl_tinystr_subtag!(
/// A script subtag (examples: `"Latn"`, `"Arab"`, etc.)
///
/// [`Script`] represents a Unicode script code conformant to the
/// [`unicode_script_id`] field of the Language and Locale Identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Script;
///
/// let script: Script =
/// "Latn".parse().expect("Failed to parse a script subtag.");
/// ```
///
/// [`unicode_script_id`]: https://unicode.org/reports/tr35/#unicode_script_id
Script,
subtags,
script,
subtags_script,
// A script subtag is always exactly four characters long.
4..=4,
s,
s.is_ascii_alphabetic(),
s.to_ascii_titlecase(),
s.is_ascii_alphabetic_titlecase(),
InvalidSubtag,
// NOTE(review): presumably sample inputs that must parse, followed by samples that must not.
["Latn"],
["Latin"],
);
impl From<Script> for Subtag {
fn from(value: Script) -> Self {
Subtag(value.0.resize())
}
}

View File

@@ -0,0 +1,35 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A variant subtag (examples: `"macos"`, `"posix"`, `"1996"` etc.)
///
/// [`Variant`] represents a Unicode variant code conformant to the
/// [`unicode_variant_id`] field of the Language and Locale Identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Variant;
///
/// let variant: Variant =
/// "macos".parse().expect("Failed to parse a variant subtag.");
/// ```
///
/// [`unicode_variant_id`]: https://unicode.org/reports/tr35/#unicode_variant_id
Variant,
subtags,
variant,
subtags_variant,
4..=8,
s,
// Alphanumeric only; a four-character variant must additionally start with a digit.
s.is_ascii_alphanumeric() && (s.len() != 4 || s.all_bytes()[0].is_ascii_digit()),
s.to_ascii_lowercase(),
s.is_ascii_lowercase()
&& s.is_ascii_alphanumeric()
&& (s.len() != 4 || s.all_bytes()[0].is_ascii_digit()),
InvalidSubtag,
// NOTE(review): presumably sample inputs that must parse, followed by samples that must not.
["posix", "1996"],
["yes"],
);

View File

@@ -0,0 +1,138 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::Variant;
use crate::shortvec::ShortBoxSlice;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
/// A list of variants (examples: `["macos", "posix"]`, etc.)
///
/// [`Variants`] is meant to hold a list of [`Variant`] subtags in canonical
/// form, that is, sorted and deduplicated. The constructors shown in this
/// file do not sort or deduplicate themselves; callers of the `_unchecked`
/// constructors must supply input that is already in that form.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::{variant, Variants};
///
/// let mut v = vec![variant!("posix"), variant!("macos")];
/// v.sort();
/// v.dedup();
///
/// let variants: Variants = Variants::from_vec_unchecked(v);
/// assert_eq!(variants.to_string(), "macos-posix");
/// ```
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct Variants(ShortBoxSlice<Variant>);
impl Variants {
    /// Creates an empty list of variants.
    ///
    /// Same as [`default()`](Default::default()), but is `const`.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::subtags::Variants;
    ///
    /// assert_eq!(Variants::new(), Variants::default());
    /// ```
    #[inline]
    pub const fn new() -> Self {
        let empty = ShortBoxSlice::new();
        Self(empty)
    }

    /// Creates a [`Variants`] list holding exactly one [`Variant`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::subtags::{variant, Variants};
    ///
    /// let variants = Variants::from_variant(variant!("posix"));
    /// ```
    #[inline]
    pub const fn from_variant(variant: Variant) -> Self {
        let single = ShortBoxSlice::new_single(variant);
        Self(single)
    }

    /// Creates a [`Variants`] list from a [`Vec`] without checking its contents.
    ///
    /// The caller is expected to pass a vector that is already sorted and
    /// deduplicated.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::subtags::{variant, Variants};
    ///
    /// let mut v = vec![variant!("posix"), variant!("macos")];
    /// v.sort();
    /// v.dedup();
    ///
    /// let variants = Variants::from_vec_unchecked(v);
    /// ```
    ///
    /// Notice: For performance- and memory-constrained environments, it is recommended
    /// for the caller to use [`binary_search`](slice::binary_search) instead of [`sort`](slice::sort)
    /// and [`dedup`](Vec::dedup()).
    #[cfg(feature = "alloc")]
    pub fn from_vec_unchecked(input: Vec<Variant>) -> Self {
        Self(ShortBoxSlice::from(input))
    }

    /// Wraps an existing [`ShortBoxSlice`] without checking its contents.
    #[cfg(feature = "alloc")]
    pub(crate) fn from_short_slice_unchecked(input: ShortBoxSlice<Variant>) -> Self {
        Self(input)
    }

    /// Empties the [`Variants`] list.
    ///
    /// Returns the old list.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::subtags::{variant, Variants};
    ///
    /// let mut v = vec![variant!("posix"), variant!("macos")];
    /// v.sort();
    /// v.dedup();
    ///
    /// let mut variants: Variants = Variants::from_vec_unchecked(v);
    ///
    /// assert_eq!(variants.to_string(), "macos-posix");
    ///
    /// variants.clear();
    ///
    /// assert_eq!(variants, Variants::default());
    /// ```
    pub fn clear(&mut self) -> Self {
        // Swap in an empty list and hand the previous contents back to the caller.
        core::mem::replace(self, Self::new())
    }

    /// Whether the list of variants is empty.
    pub const fn is_empty(&self) -> bool {
        let variants = &self.0;
        variants.is_empty()
    }

    /// Calls `f` with the string form of each variant, in order, stopping at
    /// the first error and returning it.
    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
    where
        F: FnMut(&str) -> Result<(), E>,
    {
        for variant in self.0.iter() {
            f(variant.as_str())?;
        }
        Ok(())
    }
}
// NOTE(review): presumably generates the string-writing impls for `Variants` (the doc
// examples in this file call `to_string()` on it); the two literals look like sample
// subtags — confirm against the macro definition.
impl_writeable_for_subtag_list!(Variants, "macos", "posix");
impl Deref for Variants {
    type Target = [Variant];

    /// Views the list as a slice of [`Variant`] subtags.
    fn deref(&self) -> &[Variant] {
        // Deref coercion turns the inner collection into its slice view.
        &self.0
    }
}

131
vendor/icu_locale_core/src/zerovec.rs vendored Normal file
View File

@@ -0,0 +1,131 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Documentation on zero-copy deserialization of locale types.
//!
//! [`Locale`] and [`LanguageIdentifier`] are highly structured types that cannot be directly
//! stored in a zero-copy data structure, such as those provided by the [`zerovec`](crate::zerovec) module.
//! This page explains how to indirectly store these types in a [`zerovec`](crate::zerovec).
//!
//! There are two main use cases, which have different solutions:
//!
//! 1. **Lookup:** You need to locate a locale in a zero-copy vector, such as when querying a map.
//! 2. **Obtain:** You have a locale stored in a zero-copy vector, and you need to obtain a proper
//! [`Locale`] or [`LanguageIdentifier`] for use elsewhere in your program.
//!
//! # Lookup
//!
//! To perform lookup, store the stringified locale in a canonical BCP-47 form as a byte array,
//! and then use [`Locale::strict_cmp()`] to perform an efficient, zero-allocation lookup.
//!
//! To produce more human-readable serialized output, you can use `PotentialUtf8`.
//!
//! ```
//! use icu::locale::Locale;
//! use potential_utf::PotentialUtf8;
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from locales to integers
//! let data: &[(&PotentialUtf8, u32)] = &[
//! ("de-DE-u-hc-h12".into(), 5),
//! ("en-US-u-ca-buddhist".into(), 10),
//! ("my-MM".into(), 15),
//! ("sr-Cyrl-ME".into(), 20),
//! ("zh-TW".into(), 25),
//! ];
//! let zm: ZeroMap<PotentialUtf8, u32> = data.iter().copied().collect();
//!
//! // Get the value associated with a locale
//! let loc: Locale = "en-US-u-ca-buddhist".parse().unwrap();
//! let value = zm.get_copied_by(|uvstr| loc.strict_cmp(uvstr).reverse());
//! assert_eq!(value, Some(10));
//! ```
//!
//! # Obtain
//!
//! Obtaining a [`Locale`] or [`LanguageIdentifier`] is not generally a zero-copy operation, since
//! both of these types may require memory allocation. If possible, architect your code such that
//! you do not need to obtain a structured type.
//!
//! If you need the structured type, such as if you need to manipulate it in some way, there are two
//! options: storing subtags, and storing a string for parsing.
//!
//! ## Storing Subtags
//!
//! If the data being stored only contains a limited number of subtags, you can store them as a
//! tuple, and then construct the [`LanguageIdentifier`] externally.
//!
//! ```
//! use icu::locale::subtags::{Language, Region, Script};
//! use icu::locale::LanguageIdentifier;
//! use icu::locale::{
//! langid,
//! subtags::{language, region, script},
//! };
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from integer to LSR (language-script-region)
//! let zm: ZeroMap<u32, (Language, Option<Script>, Option<Region>)> = [
//! (5, (language!("de"), None, Some(region!("DE")))),
//! (10, (language!("en"), None, Some(region!("US")))),
//! (15, (language!("my"), None, Some(region!("MM")))),
//! (
//! 20,
//! (language!("sr"), Some(script!("Cyrl")), Some(region!("ME"))),
//! ),
//! (25, (language!("zh"), None, Some(region!("TW")))),
//! ]
//! .into_iter()
//! .collect();
//!
//! // Construct a LanguageIdentifier from a tuple entry
//! let lid: LanguageIdentifier =
//! zm.get_copied(&25).expect("element is present").into();
//!
//! assert_eq!(lid, langid!("zh-TW"));
//! ```
//!
//! ## Storing Strings
//!
//! If it is necessary to store and obtain an arbitrary locale, it is currently recommended to
//! store a BCP-47 string and parse it when needed.
//!
//! Since the string is stored in an unparsed state, it is not safe to `unwrap` the result from
//! `Locale::try_from_utf8()`. See [icu4x#831](https://github.com/unicode-org/icu4x/issues/831)
//! for a discussion on potential data models that could ensure that the locale is valid during
//! deserialization.
//!
//! As above, to produce more human-readable serialized output, you can use `PotentialUtf8`.
//!
//! ```
//! use icu::locale::langid;
//! use icu::locale::Locale;
//! use potential_utf::PotentialUtf8;
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from integer to locale string
//! let data: &[(u32, &PotentialUtf8)] = &[
//! (5, "de-DE-u-hc-h12".into()),
//! (10, "en-US-u-ca-buddhist".into()),
//! (15, "my-MM".into()),
//! (20, "sr-Cyrl-ME".into()),
//! (25, "zh-TW".into()),
//! (30, "INVALID".into()),
//! ];
//! let zm: ZeroMap<u32, PotentialUtf8> = data.iter().copied().collect();
//!
//! // Construct a Locale by parsing the string.
//! let value = zm.get(&25).expect("element is present");
//! let loc = Locale::try_from_utf8(value);
//! assert_eq!(loc, Ok(langid!("zh-TW").into()));
//!
//! // Invalid entries are fallible
//! let err_value = zm.get(&30).expect("element is present");
//! let err_loc = Locale::try_from_utf8(err_value);
//! assert!(err_loc.is_err());
//! ```
//!
//! [`Locale`]: crate::Locale
//! [`Locale::strict_cmp()`]: crate::Locale::strict_cmp()
//! [`LanguageIdentifier`]: crate::LanguageIdentifier

View File

@@ -0,0 +1,68 @@
[
{
"input": "Pl",
"output": "pl"
},
{
"input": "eN-uS",
"output": "en-US"
},
{
"input": "ZH-hans-hK",
"output": "zh-Hans-HK"
},
{
"input": "en-scouse-fonipa",
"output": "en-fonipa-scouse"
},
{
"input": {
"type": "Locale",
"identifier": "en-US-t-es-AR-x-foo"
},
"output": {
"type": "Locale",
"identifier": "en-US-t-es-ar-x-foo"
}
},
{
"input": {
"type": "Locale",
"identifier": "en-t-en-Latn-CA-emodeng"
},
"output": {
"type": "Locale",
"identifier": "en-t-en-latn-ca-emodeng"
}
},
{
"input": {
"type": "Locale",
"identifier": "EN-US-T-ES-AR-X-FOO"
},
"output": {
"type": "Locale",
"identifier": "en-US-t-es-ar-x-foo"
}
},
{
"input": {
"type": "Locale",
"identifier": "EN-T-EN-LATN-CA-EMODENG"
},
"output": {
"type": "Locale",
"identifier": "en-t-en-latn-ca-emodeng"
}
},
{
"input": {
"type": "Locale",
"identifier": "UND-CYRL-T-ES-LATN-M0-UNGEGN"
},
"output": {
"type": "Locale",
"identifier": "und-Cyrl-t-es-latn-m0-ungegn"
}
}
]

View File

@@ -0,0 +1,162 @@
[
{
"input": {
"type": "Locale",
"identifier": "cmn-hans-cn-t-ca-u-ca-x_t-u"
},
"output": {
"error": "InvalidExtension",
"text": "unused"
}
},
{
"input": {
"type": "Locale",
"identifier": "pl-US-x-waytoolongkey"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "pl-US-x-@A-3"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "pl-US-t-h0"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "pl-US-t-h0-x-foo"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "pl-US-t-h0"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-t-m0"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-t-m0-n0-mixed"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "da-u"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "da-u--"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "da-u-t-latn"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "cmn-hans-cn-u-u"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "de-u-ca-"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "de-u-ca-gregory-"
},
"output": {
"error": "InvalidExtension",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "de-u-ca-gregory-u-hc-hc24"
},
"output": {
"error": "DuplicatedExtension",
"text": "Duplicated extension"
}
},
{
"input": {
"type": "Locale",
"identifier": "de-l-foo-l-bar"
},
"output": {
"error": "DuplicatedExtension",
"text": "Duplicated extension"
}
}
]

View File

@@ -0,0 +1,142 @@
[
{
"input": "-",
"output": {
"error": "InvalidLanguage",
"text": "The given language subtag is invalid"
}
},
{
"input": "--",
"output": {
"error": "InvalidLanguage",
"text": "The given subtag is invalid"
}
},
{
"input": "en-",
"output": {
"error": "InvalidSubtag",
"text": "The given subtag is invalid"
}
},
{
"input": "-en",
"output": {
"error": "InvalidLanguage",
"text": "The given subtag is invalid"
}
},
{
"input": "en-us-",
"output": {
"error": "InvalidSubtag",
"text": "The given subtag is invalid"
}
},
{
"input": "en_us",
"output": {
"error": "InvalidLanguage",
"text": "The given language subtag is invalid"
}
},
{
"input": "en--US",
"output": {
"error": "InvalidSubtag",
"text": "The given subtag is invalid"
}
},
{
"input": "-e-",
"output": {
"error": "InvalidLanguage",
"text": "The given subtag is invalid"
}
},
{
"input": "a1a",
"output": {
"error": "InvalidLanguage",
"text": "The given language subtag is invalid"
}
},
{
"input": "Arab-US",
"output": {
"error": "InvalidLanguage",
"text": "The given language subtag is invalid"
}
},
{
"input": "",
"output": {
"error": "InvalidLanguage",
"text": "The given language subtag is invalid"
}
},
{
"input": "pl-DSDAFAFDF",
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
},
{
"input": "pl-Latn-$1231",
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
},
{
"input": "pl-Latn-US-$1231",
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
},
{
"input": "pl-Latn-12",
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
},
{
"input": "pl-Latn-a12",
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
},
{
"input": "pl-Latn-US-3-dd",
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "pl-Latn-US-variant-h0-hybrid"
},
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
},
{
"input": {
"type": "Locale",
"identifier": "en-variant-emodeng-emodeng"
},
"output": {
"error": "InvalidSubtag",
"text": "Invalid subtag"
}
}
]

View File

@@ -0,0 +1,167 @@
[
{
"input": "en",
"output": {
"type": "LanguageIdentifier",
"language": "en"
}
},
{
"input": "lij",
"output": {
"type": "LanguageIdentifier",
"language": "lij"
}
},
{
"input": "en-Latn",
"output": {
"type": "LanguageIdentifier",
"language": "en",
"script": "Latn"
}
},
{
"input": "lij-Arab",
"output": {
"type": "LanguageIdentifier",
"language": "lij",
"script": "Arab"
}
},
{
"input": "en-Latn-US",
"output": {
"type": "LanguageIdentifier",
"language": "en",
"script": "Latn",
"region": "US"
}
},
{
"input": "lij-Arab-FA",
"output": {
"type": "LanguageIdentifier",
"language": "lij",
"script": "Arab",
"region": "FA"
}
},
{
"input": "en-Latn-US-windows",
"output": {
"type": "LanguageIdentifier",
"language": "en",
"script": "Latn",
"region": "US",
"variants": ["windows"]
}
},
{
"input": "lij-Arab-FA-linux",
"output": {
"type": "LanguageIdentifier",
"language": "lij",
"script": "Arab",
"region": "FA",
"variants": ["linux"]
}
},
{
"input": "lij-Arab-FA-linux-nedis",
"output": {
"type": "LanguageIdentifier",
"language": "lij",
"script": "Arab",
"region": "FA",
"variants": ["linux", "nedis"]
}
},
{
"input": "EN-latn-us",
"output": {
"type": "LanguageIdentifier",
"language": "en",
"script": "Latn",
"region": "US"
}
},
{
"input": "sl-nedis",
"output": {
"type": "LanguageIdentifier",
"language": "sl",
"variants": ["nedis"]
}
},
{
"input": "de-CH-1996",
"output": {
"type": "LanguageIdentifier",
"language": "de",
"region": "CH",
"variants": ["1996"]
}
},
{
"input": "sr-Latn",
"output": {
"type": "LanguageIdentifier",
"language": "sr",
"script": "Latn"
}
},
{
"input": "es-419",
"output": {
"type": "LanguageIdentifier",
"language": "es",
"region": "419"
}
},
{
"input": "und-Latn-US",
"output": {
"type": "LanguageIdentifier",
"script": "Latn",
"region": "US"
}
},
{
"input": "und",
"output": {
"type": "LanguageIdentifier"
}
},
{
"input": "und-Latn",
"output": {
"type": "LanguageIdentifier",
"script": "Latn"
}
},
{
"input": "pl-macos-Windows-nedis-aRabic",
"output": {
"type": "LanguageIdentifier",
"language": "pl",
"variants": ["arabic", "macos", "nedis", "windows"]
}
},
{
"input": "und-Latn-macos",
"output": {
"type": "LanguageIdentifier",
"script": "Latn",
"variants": ["macos"]
}
},
{
"input": "und-Latn-312",
"output": {
"type": "LanguageIdentifier",
"script": "Latn",
"region": "312"
}
}
]

View File

@@ -0,0 +1,298 @@
[
{
"input": {
"type": "Locale",
"identifier": "en-u-hc-h12"
},
"output": {
"type": "Locale",
"language": "en",
"extensions": {
"unicode": {
"keywords": {
"hc": "h12"
}
}
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-u-hc-h23"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"unicode": {
"keywords": {
"hc": "h23"
}
}
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-u-foo"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"unicode": {
"attributes": [
"foo"
]
}
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-u-hc-h23-ca-islamic-civil-ss-true"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"unicode": {
"keywords": {
"hc": "h23",
"ca": "islamic-civil",
"ss": "true"
}
}
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-t-pl-latn-de"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"transform": {
"tlang": "pl-Latn-DE"
}
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-x-private-foobar"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"private": ["private", "foobar"]
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-t-h0-hybrid-k0-platform-s0-true"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"transform": {
"tfields": {
"h0": "hybrid",
"k0": "platform",
"s0": "true"
}
}
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-t-es-ar-x-foo"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"transform": {
"tlang": "es-AR"
},
"private": ["foo"]
}
}
},
{
"input": {
"type": "Locale",
"identifier": "en-US-u-ca-buddhist-hc-h12-t-es-ar-h0-hybrid-x-private-foobar"
},
"output": {
"type": "Locale",
"language": "en",
"region": "US",
"extensions": {
"unicode": {
"keywords": {
"ca": "buddhist",
"hc": "h12"
}
},
"transform": {
"tlang": "es-AR",
"tfields": {
"h0": "hybrid"
}
},
"private": ["private", "foobar"]
}
}
},
{
"input": {
"type": "Locale",
"language": "es",
"region": "MX",
"extensions": {
"unicode": {
"keywords": {
"ca": "islamic",
"co": "search",
"nu": "roman"
}
}
}
},
"output": {
"type": "Locale",
"identifier": "es-MX-u-ca-islamic-co-search-nu-roman"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-u-kn"
},
"output": {
"type": "Locale",
"identifier": "und-u-kn"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-u-kn-ca-calendar"
},
"output": {
"type": "Locale",
"identifier": "und-u-ca-calendar-kn"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-u-kn-nu-arab"
},
"output": {
"type": "Locale",
"identifier": "und-u-kn-nu-arab"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-t-m0-true"
},
"output": {
"type": "Locale",
"identifier": "und-t-m0-true"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-t-m0-true-n0-mixed"
},
"output": {
"type": "Locale",
"identifier": "und-t-m0-true-n0-mixed"
}
},
{
"input": {
"type": "Locale",
"identifier": "und-t-m0-true-c0-mixed"
},
"output": {
"type": "Locale",
"identifier": "und-t-c0-mixed-m0-true"
}
},
{
"input": {
"type": "Locale",
"identifier": "da-u-ca-gregory-ca-buddhist"
},
"output": {
"type": "Locale",
"identifier": "da-u-ca-gregory"
}
},
{
"input": {
"type": "Locale",
"identifier": "pt-u-attr2-attr1-ca-gregory"
},
"output": {
"type": "Locale",
"identifier": "pt-u-attr1-attr2-ca-gregory"
}
},
{
"input": {
"type": "Locale",
"identifier": "pt-u-attr1-attr2-attr1-ca-gregory"
},
"output": {
"type": "Locale",
"identifier": "pt-u-attr1-attr2-ca-gregory"
}
},
{
"input": {
"type": "Locale",
"identifier": "en-a-not-assigned"
},
"output": {
"type": "Locale",
"identifier": "en-a-not-assigned"
}
},
{
"input": {
"type": "Locale",
"identifier": "en-w-bar-u-foo-a-bar-x-u-foo"
},
"output": {
"type": "Locale",
"identifier": "en-a-bar-u-foo-w-bar-x-u-foo"
}
}
]

View File

@@ -0,0 +1,254 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use std::collections::HashMap;
use std::convert::{TryFrom, TryInto};
use icu_locale_core::extensions::private;
use icu_locale_core::extensions::transform;
use icu_locale_core::extensions::unicode;
use icu_locale_core::extensions::Extensions;
use icu_locale_core::{subtags, LanguageIdentifier, Locale, ParseError};
use serde::Deserialize;
/// Fixture entry that carries a locale as one unparsed identifier string,
/// e.g. `{ "type": "Locale", "identifier": "en-US-u-hc-h23" }`.
#[derive(Debug, Deserialize, Clone)]
pub struct LocaleIdentifier {
/// Value of the JSON `type` key (the fixtures use `"Locale"`).
#[serde(rename = "type")]
pub field_type: String,
/// The identifier text that the conversions in this module parse.
pub identifier: String,
}
/// Fixture description of the `unicode` part of a locale's extensions.
#[derive(Debug, Deserialize, Clone)]
pub struct LocaleExtensionUnicode {
/// Keyword key -> optional value; defaults to an empty map when the JSON key is absent.
/// A `None` value is converted to `unicode::Value::try_from_str("")` by the
/// `Extensions` conversion in this module.
#[serde(default)]
keywords: HashMap<String, Option<String>>,
/// Attribute subtags; defaults to an empty list when the JSON key is absent.
#[serde(default)]
attributes: Vec<String>,
}
/// Fixture description of the `transform` part of a locale's extensions.
#[derive(Debug, Deserialize, Clone)]
pub struct LocaleExtensionTransform {
/// Optional transform language, stored as a string and parsed during conversion.
tlang: Option<String>,
/// Transform field key -> value; defaults to an empty map when the JSON key is absent.
/// The `Extensions` conversion in this module panics on a `None` value.
#[serde(default)]
tfields: HashMap<String, Option<String>>,
}
/// Fixture description of all extensions attached to a locale.
#[derive(Debug, Deserialize, Clone)]
pub struct LocaleExtensions {
/// Unicode extension data, if the fixture provides any.
unicode: Option<LocaleExtensionUnicode>,
/// Transform extension data, if the fixture provides any.
transform: Option<LocaleExtensionTransform>,
/// Private-use subtags; defaults to an empty list when the JSON key is absent.
#[serde(default)]
private: Vec<String>,
/// Deserialized but not read by the conversion code in this module.
_other: Option<String>,
}
/// Builds real [`Extensions`] from the JSON fixture description.
///
/// Every piece is parsed with `expect`, so a malformed fixture panics with a
/// message naming the part that failed; as written, this conversion never
/// returns `Err`.
impl TryFrom<LocaleExtensions> for Extensions {
    type Error = ParseError;

    fn try_from(input: LocaleExtensions) -> Result<Self, Self::Error> {
        let mut ext = Extensions::default();

        if let Some(unicode) = input.unicode {
            ext.unicode.keywords = unicode
                .keywords
                .iter()
                .map(|(key, value)| {
                    let key = unicode::Key::try_from_str(key).expect("Parsing key failed.");
                    // Build the empty fallback value only when the fixture
                    // supplies no value, instead of on every keyword.
                    let value = match value {
                        Some(v) => unicode::Value::try_from_str(v).expect("Parsing type failed."),
                        None => unicode::Value::try_from_str("").expect("Failed to parse Value"),
                    };
                    (key, value)
                })
                .collect();

            let attributes: Vec<unicode::Attribute> = unicode
                .attributes
                .iter()
                .map(|a| unicode::Attribute::try_from_str(a).expect("Parsing attribute failed."))
                .collect();
            ext.unicode.attributes = unicode::Attributes::from_vec_unchecked(attributes);
        }

        if let Some(transform) = input.transform {
            ext.transform.fields = transform
                .tfields
                .iter()
                .map(|(key, value)| {
                    let key = transform::Key::try_from_str(key).expect("Parsing key failed.");
                    // A transform field must carry a value in the fixtures.
                    let value = value.as_ref().expect("Value cannot be empty.");
                    let value =
                        transform::Value::try_from_str(value).expect("Parsing value failed.");
                    (key, value)
                })
                .collect();

            if let Some(tlang) = transform.tlang {
                ext.transform.lang = Some(tlang.parse().expect("Failed to parse tlang."));
            }
        }

        let private_subtags: Vec<private::Subtag> = input
            .private
            .iter()
            .map(|s| private::Subtag::try_from_str(s).expect("Failed to add field."))
            .collect();
        ext.private = private::Private::from_vec_unchecked(private_subtags);

        Ok(ext)
    }
}
/// Fixture entry that spells a locale out subtag by subtag instead of as one string.
#[derive(Debug, Deserialize, Clone)]
pub struct LocaleSubtags {
/// Value of the JSON `type` key (the fixtures use `"LanguageIdentifier"` or `"Locale"`).
#[serde(rename = "type")]
pub field_type: String,
/// Language subtag; when absent the conversions use `subtags::Language::UNKNOWN`.
pub language: Option<String>,
/// Optional script subtag.
pub script: Option<String>,
/// Optional region subtag.
pub region: Option<String>,
/// Variant subtags; defaults to an empty list when the JSON key is absent.
#[serde(default)]
pub variants: Vec<String>,
/// Optional extensions; only the `Locale` conversion reads this field.
pub extensions: Option<LocaleExtensions>,
}
/// Fixture entry describing the parse error a test case is expected to produce.
#[derive(Debug, Deserialize, Clone)]
pub struct LocaleError {
/// Name of the expected `ParseError` variant, e.g. `"InvalidSubtag"`.
pub error: String,
/// Human-readable description; the `ParseError` conversion in this module does not read it.
pub text: String,
}
/// One side (`input` or `output`) of a fixture test case.
///
/// The enum is `untagged`, so the JSON shape alone selects the variant and the
/// declaration order matters: `Object` only requires a `type` key, so it is
/// listed after the more specific object shapes.
#[derive(Debug, Deserialize, Clone)]
#[serde(untagged)]
#[expect(clippy::large_enum_variant)] // test code
pub enum LocaleInfo {
/// A bare JSON string holding an identifier.
String(String),
/// An object with `error` and `text` keys: the expected parse failure.
Error(LocaleError),
/// An object with `type` and `identifier` keys.
Identifier(LocaleIdentifier),
/// An object listing individual subtags (and optionally extensions).
Object(LocaleSubtags),
}
impl TryFrom<LocaleInfo> for LanguageIdentifier {
type Error = ParseError;
fn try_from(input: LocaleInfo) -> Result<Self, Self::Error> {
match input {
LocaleInfo::String(s) => s.parse(),
LocaleInfo::Error(e) => Err(e.into()),
LocaleInfo::Identifier(ident) => ident.try_into(),
LocaleInfo::Object(o) => o.try_into(),
}
}
}
impl TryFrom<LocaleInfo> for Locale {
type Error = ParseError;
fn try_from(input: LocaleInfo) -> Result<Self, Self::Error> {
match input {
LocaleInfo::String(s) => s.parse(),
LocaleInfo::Error(e) => Err(e.into()),
LocaleInfo::Identifier(ident) => ident.try_into(),
LocaleInfo::Object(o) => o.try_into(),
}
}
}
impl TryFrom<LocaleIdentifier> for LanguageIdentifier {
type Error = ParseError;
fn try_from(input: LocaleIdentifier) -> Result<Self, Self::Error> {
LanguageIdentifier::try_from_locale_bytes(input.identifier.as_bytes())
}
}
impl TryFrom<LocaleIdentifier> for Locale {
type Error = ParseError;
fn try_from(input: LocaleIdentifier) -> Result<Self, Self::Error> {
Locale::try_from_str(&input.identifier)
}
}
impl TryFrom<LocaleSubtags> for LanguageIdentifier {
type Error = ParseError;
fn try_from(subtags: LocaleSubtags) -> Result<Self, Self::Error> {
let language = if let Some(lang) = subtags.language {
lang.parse().expect("Failed to parse language subtag")
} else {
subtags::Language::UNKNOWN
};
let script = subtags
.script
.map(|s| s.parse().expect("Failed to parse script subtag."));
let region = subtags
.region
.map(|s| s.parse().expect("Failed to parse region subtag."));
let variants = subtags
.variants
.iter()
.map(|v| v.parse().expect("Failed to parse variant subtag."))
.collect::<Vec<_>>();
Ok(LanguageIdentifier {
language,
script,
region,
variants: subtags::Variants::from_vec_unchecked(variants),
})
}
}
impl TryFrom<LocaleSubtags> for Locale {
type Error = ParseError;
fn try_from(subtags: LocaleSubtags) -> Result<Self, Self::Error> {
let language = if let Some(lang) = subtags.language {
lang.parse().expect("Failed to parse language subtag")
} else {
subtags::Language::UNKNOWN
};
let script = subtags
.script
.map(|s| s.parse().expect("Failed to parse script subtag."));
let region = subtags
.region
.map(|s| s.parse().expect("Failed to parse region subtag."));
let variants = subtags
.variants
.iter()
.map(|v| v.parse().expect("Failed to parse variant subtag."))
.collect::<Vec<_>>();
let extensions = if let Some(e) = subtags.extensions {
e.try_into().expect("Failed to parse extensions.")
} else {
Extensions::default()
};
Ok(Locale {
id: LanguageIdentifier {
language,
script,
region,
variants: subtags::Variants::from_vec_unchecked(variants),
},
extensions,
})
}
}
/// Maps the `error` name stored in a fixture to the matching [`ParseError`] variant.
///
/// # Panics
///
/// Panics when the fixture names an error this helper does not know. That
/// situation is reachable through a typo in a fixture file, so the panic
/// message includes the offending name.
impl From<LocaleError> for ParseError {
    fn from(e: LocaleError) -> Self {
        match e.error.as_str() {
            "InvalidLanguage" => ParseError::InvalidLanguage,
            "InvalidSubtag" => ParseError::InvalidSubtag,
            "InvalidExtension" => ParseError::InvalidExtension,
            "DuplicatedExtension" => ParseError::DuplicatedExtension,
            other => panic!("Unknown error name in fixture: {other:?}"),
        }
    }
}
/// One fixture test case: the value to convert and the result it must produce.
#[derive(Debug, Deserialize)]
pub struct LocaleTest {
/// What the test feeds into the conversion.
pub input: LocaleInfo,
/// The expected outcome: a serialized string, an error, or a structured value.
pub output: LocaleInfo,
}

156
vendor/icu_locale_core/tests/langid.rs vendored Normal file
View File

@@ -0,0 +1,156 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod fixtures;
use std::convert::TryInto;
use writeable::*;
use icu_locale_core::{subtags, LanguageIdentifier, ParseError};
/// Outcome of converting a fixture into a `LanguageIdentifier`.
/// Shadows the prelude `Result` within this test file.
type Result = std::result::Result<LanguageIdentifier, ParseError>;
/// Checks every fixture case against `LanguageIdentifier` parsing.
///
/// The expected `output` decides how the case is verified: a string is
/// compared with the serialized form, an error with the returned `Err`, and
/// the object shapes with the value they convert to. Cases whose input is a
/// subtag object of type `"Locale"` and whose output is a string are skipped.
fn test_langid_fixtures(tests: Vec<fixtures::LocaleTest>) {
    for fixtures::LocaleTest { input, output } in tests {
        match output {
            fixtures::LocaleInfo::String(expected) => {
                let is_locale_object = matches!(
                    &input,
                    fixtures::LocaleInfo::Object(o) if o.field_type == "Locale"
                );
                if is_locale_object {
                    continue;
                }
                let parsed: LanguageIdentifier = input.try_into().expect("Parsing failed.");
                assert_writeable_eq!(parsed, expected);
            }
            fixtures::LocaleInfo::Error(err) => {
                let expected: ParseError = err.into();
                let actual: Result = input.try_into();
                assert_eq!(actual, Err(expected));
            }
            fixtures::LocaleInfo::Identifier(ident) => {
                let parsed: LanguageIdentifier = input.try_into().expect("Parsing failed.");
                let expected: LanguageIdentifier = ident.try_into().expect("Parsing failed.");
                assert_eq!(parsed, expected);
            }
            fixtures::LocaleInfo::Object(parts) => {
                let parsed: LanguageIdentifier = input.try_into().expect("Parsing failed.");
                let expected: LanguageIdentifier = parts.try_into().expect("Parsing failed.");
                assert_eq!(parsed, expected);
            }
        }
    }
}
/// Runs the cases in `fixtures/langid.json` through the shared fixture checker.
#[test]
fn test_langid_parsing() {
    let raw = include_str!("fixtures/langid.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    test_langid_fixtures(cases);
}
/// Runs the cases in `fixtures/invalid.json` through the shared fixture checker.
#[test]
fn test_langid_invalid() {
    let raw = include_str!("fixtures/invalid.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    test_langid_fixtures(cases);
}
/// Runs the cases in `fixtures/canonicalize.json` through the shared fixture checker.
#[test]
fn test_langid_canonicalize() {
    let raw = include_str!("fixtures/canonicalize.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    test_langid_fixtures(cases);
}
/// Runs the locale cases in `fixtures/locale.json` through the language-identifier checker.
#[test]
fn test_langid_from_locale() {
    let raw = include_str!("fixtures/locale.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    test_langid_fixtures(cases);
}
/// A parsed language exposes its text, and the unknown language serializes as `und`.
#[test]
fn test_langid_subtag_language() {
    let english = "en"
        .parse::<subtags::Language>()
        .expect("Failed to parse a language.");
    assert_eq!(english.as_str(), "en");

    let unknown = subtags::Language::UNKNOWN;
    assert!(unknown.is_unknown());
    assert_writeable_eq!(unknown, "und");
}
/// A lowercase region input comes back uppercased from both `as_str` and serialization.
#[test]
fn test_langid_subtag_region() {
    let parsed = "en"
        .parse::<subtags::Region>()
        .expect("Failed to parse a region.");
    assert_eq!(parsed.as_str(), "EN");
    assert_writeable_eq!(parsed, "EN");
}
/// A script subtag round-trips through parsing, `as_str` and serialization.
#[test]
fn test_langid_subtag_script() {
    let parsed = "Latn"
        .parse::<subtags::Script>()
        .expect("Failed to parse a script.");
    assert_eq!(parsed.as_str(), "Latn");
    assert_writeable_eq!(parsed, "Latn");
}
/// A variant subtag round-trips through parsing, `as_str` and serialization.
#[test]
fn test_langid_subtag_variant() {
    let parsed = "macos"
        .parse::<subtags::Variant>()
        .expect("Failed to parse a variant.");
    assert_eq!(parsed.as_str(), "macos");
    assert_writeable_eq!(parsed, "macos");
}
/// A `Variants` list built from one variant exposes it first and is empty after `clear`.
#[test]
fn test_langid_subtag_variants() {
    let macos = "macos"
        .parse::<subtags::Variant>()
        .expect("Failed to parse a variant.");
    let mut list = subtags::Variants::from_vec_unchecked(vec![macos]);
    assert_eq!(list.first(), Some(&macos));

    list.clear();
    assert_eq!(list.len(), 0);
}
/// Every language identifier must be `normalizing_eq` to its own serialized form.
#[test]
fn test_langid_normalizing_eq_str() {
    let raw = include_str!("fixtures/langid.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    for fixtures::LocaleTest { input, .. } in cases {
        let parsed: LanguageIdentifier = input.try_into().expect("Parsing failed.");
        let rendered = parsed.write_to_string();
        assert!(parsed.normalizing_eq(&rendered));
    }

    // Check that trailing characters are not ignored
    let english: LanguageIdentifier = "en".parse().expect("Parsing failed.");
    assert!(!english.normalizing_eq("en-US"));
}
/// `strict_cmp` against raw bytes must agree with a plain byte-wise comparison
/// of the identifier's serialized form, for every pair of fixture inputs.
#[test]
fn test_langid_strict_cmp() {
    let tests: Vec<fixtures::LocaleTest> =
        serde_json::from_str(include_str!("fixtures/langid.json"))
            .expect("Failed to read a fixture");
    let bcp47_strings = tests
        .iter()
        .map(|t| match &t.input {
            fixtures::LocaleInfo::String(s) => s.as_str(),
            _ => panic!("Invalid fixture"),
        })
        .collect::<Vec<&str>>();

    for a in bcp47_strings.iter() {
        // Parsing and serializing `a` does not depend on `b`, so do it once
        // per outer iteration instead of once per pair.
        let a_langid = a
            .parse::<LanguageIdentifier>()
            .expect("Invalid BCP-47 in fixture");
        let a_normalized = a_langid.write_to_string();
        for b in bcp47_strings.iter() {
            let string_cmp = a_normalized.as_bytes().cmp(b.as_bytes());
            let test_cmp = a_langid.strict_cmp(b.as_bytes());
            assert_eq!(string_cmp, test_cmp, "{a:?}/{b:?}");
        }
    }
}

119
vendor/icu_locale_core/tests/locale.rs vendored Normal file
View File

@@ -0,0 +1,119 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod fixtures;
use std::convert::TryInto;
use writeable::*;
use icu_locale_core::{LanguageIdentifier, Locale, ParseError};
/// Outcome of converting a fixture into a `Locale`.
/// Shadows the prelude `Result` within this test file.
type Result = std::result::Result<Locale, ParseError>;
/// Checks every fixture case against `Locale` parsing.
///
/// The expected `output` decides how the case is verified: a string is
/// compared with the serialized form, an error with the returned `Err`, an
/// identifier with both the parsed value and its serialized text, and a
/// subtag object with the value it converts to.
fn test_langid_fixtures(tests: Vec<fixtures::LocaleTest>) {
    for fixtures::LocaleTest { input, output } in tests {
        match output {
            fixtures::LocaleInfo::String(expected) => {
                let parsed: Locale = input.try_into().expect("Parsing failed.");
                assert_writeable_eq!(parsed, expected);
            }
            fixtures::LocaleInfo::Error(err) => {
                let expected: ParseError = err.into();
                let actual: Result = input.try_into();
                assert_eq!(actual, Err(expected));
            }
            fixtures::LocaleInfo::Identifier(ident) => {
                let parsed: Locale = input.try_into().expect("Parsing failed.");
                let expected: Locale = ident.clone().try_into().expect("Parsing failed.");
                assert_eq!(parsed, expected);
                // The expected identifier must also be the serialized form.
                assert_writeable_eq!(parsed, ident.identifier);
            }
            fixtures::LocaleInfo::Object(parts) => {
                let parsed: Locale = input.try_into().expect("Parsing failed.");
                let expected: Locale = parts.try_into().expect("Parsing failed.");
                assert_eq!(parsed, expected);
            }
        }
    }
}
/// Runs the cases in `fixtures/locale.json` through the shared fixture checker.
#[test]
fn test_locale_parsing() {
    let raw = include_str!("fixtures/locale.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    test_langid_fixtures(cases);
}
/// Runs the cases in `fixtures/invalid-extensions.json` through the shared fixture checker.
#[test]
fn test_locale_invalid() {
    let raw = include_str!("fixtures/invalid-extensions.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    test_langid_fixtures(cases);
}
/// The unknown locale has no extensions and serializes as `und`.
#[test]
fn test_locale_is_empty() {
    let unknown: Locale = Locale::UNKNOWN;
    let has_no_extensions = unknown.extensions.is_empty();
    assert!(has_no_extensions);
    assert_writeable_eq!(unknown, "und");
}
/// Converting the unknown locale to a language identifier and back yields an equal locale.
#[test]
fn test_locale_conversions() {
    let original: Locale = Locale::UNKNOWN;
    let as_langid: LanguageIdentifier = original.clone().into();
    let round_tripped: Locale = as_langid.into();
    assert_eq!(original, round_tripped);
}
/// Runs the cases in `fixtures/canonicalize.json` through the shared fixture checker.
#[test]
fn test_locale_canonicalize() {
    let raw = include_str!("fixtures/canonicalize.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    test_langid_fixtures(cases);
}
/// Every locale must be `normalizing_eq` to its own serialized form.
#[test]
fn test_locale_normalizing_eq_str() {
    let raw = include_str!("fixtures/locale.json");
    let cases: Vec<fixtures::LocaleTest> =
        serde_json::from_str(raw).expect("Failed to read a fixture");
    for fixtures::LocaleTest { input, .. } in cases {
        let parsed: Locale = input.try_into().expect("Parsing failed.");
        let rendered = parsed.write_to_string();
        assert!(parsed.normalizing_eq(&rendered));
    }

    // Check that trailing characters are not ignored
    let english: Locale = "en".parse().expect("Parsing failed.");
    assert!(!english.normalizing_eq("en-US"));
}
/// `strict_cmp` against raw bytes must agree with a plain byte-wise comparison
/// of the locale's serialized form, for every pair of fixture identifiers.
///
/// Each case contributes the identifier string from its input, or from its
/// output when the input is not an identifier.
#[test]
fn test_locale_strict_cmp() {
    let tests: Vec<fixtures::LocaleTest> =
        serde_json::from_str(include_str!("fixtures/locale.json"))
            .expect("Failed to read a fixture");
    let bcp47_strings = tests
        .iter()
        .map(|t| match (&t.input, &t.output) {
            // The input identifier wins; the output is only a fallback.
            (fixtures::LocaleInfo::Identifier(s), _)
            | (_, fixtures::LocaleInfo::Identifier(s)) => s.identifier.as_str(),
            _ => panic!("No string in fixture input or output: {t:?}"),
        })
        .collect::<Vec<&str>>();

    for a in bcp47_strings.iter() {
        // Parsing and serializing `a` does not depend on `b`, so do it once
        // per outer iteration instead of once per pair.
        let a_locale = a.parse::<Locale>().expect("Invalid BCP-47 in fixture");
        let a_normalized = a_locale.write_to_string();
        for b in bcp47_strings.iter() {
            let string_cmp = a_normalized.as_bytes().cmp(b.as_bytes());
            let test_cmp = a_locale.strict_cmp(b.as_bytes());
            assert_eq!(string_cmp, test_cmp, "{a:?}/{b:?}");
        }
    }
}