chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

445
vendor/icu_locale_core/src/data.rs vendored Normal file
View File

@@ -0,0 +1,445 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions::unicode as unicode_ext;
use crate::subtags::{Language, Region, Script, Subtag, Variant};
#[cfg(feature = "alloc")]
use crate::ParseError;
use crate::{LanguageIdentifier, Locale};
use core::cmp::Ordering;
use core::default::Default;
use core::fmt;
use core::hash::Hash;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A locale type optimized for use in fallbacking and the ICU4X data pipeline.
///
/// [`DataLocale`] contains less functionality than [`Locale`] but more than
/// [`LanguageIdentifier`] for better size and performance while still meeting
/// the needs of the ICU4X data pipeline.
///
/// You can create a [`DataLocale`] from a borrowed [`Locale`], which is more
/// efficient than cloning the [`Locale`], but less efficient than converting an owned
/// [`Locale`]:
///
/// ```
/// use icu_locale_core::locale;
/// use icu_provider::DataLocale;
///
/// let locale1 = locale!("en-u-ca-buddhist");
/// let data_locale = DataLocale::from(&locale1);
/// ```
///
/// [`DataLocale`] only supports `-u-sd` keywords, to reflect the current state of CLDR data
/// lookup and fallback. This may change in the future.
///
/// ```
/// use icu_locale_core::{locale, Locale};
/// use icu_provider::DataLocale;
///
/// let locale = "hi-IN-t-en-h0-hybrid-u-attr-ca-buddhist-sd-inas"
/// .parse::<Locale>()
/// .unwrap();
///
/// assert_eq!(
/// DataLocale::from(locale),
/// DataLocale::from(locale!("hi-IN-u-sd-inas"))
/// );
/// ```
#[derive(Clone, Copy, PartialEq, Hash, Eq)]
#[non_exhaustive]
pub struct DataLocale {
/// Language subtag
pub language: Language,
/// Script subtag
pub script: Option<Script>,
/// Region subtag
pub region: Option<Region>,
/// Variant subtag (at most one is stored)
pub variant: Option<Variant>,
/// Subdivision (-u-sd-) subtag
pub subdivision: Option<Subtag>,
}
impl Default for DataLocale {
    /// Builds the locale with [`Language::UNKNOWN`] and every optional subtag unset.
    fn default() -> Self {
        let language = Language::UNKNOWN;
        Self {
            subdivision: None,
            variant: None,
            region: None,
            script: None,
            language,
        }
    }
}
impl DataLocale {
    /// `const` counterpart of `Default::default`: the unknown language with no
    /// script, region, variant or subdivision.
    pub const fn default() -> Self {
        Self {
            subdivision: None,
            variant: None,
            region: None,
            script: None,
            language: Language::UNKNOWN,
        }
    }
}
impl Default for &DataLocale {
    /// Hands out a reference to a single shared default [`DataLocale`].
    fn default() -> Self {
        // The inherent `const fn default` makes a `static` initializer possible.
        static SHARED_DEFAULT: DataLocale = DataLocale::default();
        &SHARED_DEFAULT
    }
}
impl fmt::Debug for DataLocale {
    /// Debug output is the `Display` form wrapped as `DataLocale{...}`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_fmt(format_args!("DataLocale{{{self}}}"))
    }
}
// Invokes the crate-level `impl_writeable_for_each_subtag_str_no_test!` macro for
// `DataLocale`. The macro is defined outside this file; presumably it derives the
// `Writeable`/`Display` impls from `for_each_subtag_str` — TODO confirm.
// The trailing arm handles the language-only case: when script, region, variant and
// subdivision are all absent, it yields the language string directly.
impl_writeable_for_each_subtag_str_no_test!(DataLocale, selff, selff.script.is_none() && selff.region.is_none() && selff.variant.is_none() && selff.subdivision.is_none() => Some(selff.language.as_str()));
impl From<LanguageIdentifier> for DataLocale {
    /// Converts an owned [`LanguageIdentifier`] by reusing the borrowed conversion.
    fn from(langid: LanguageIdentifier) -> Self {
        let borrowed = &langid;
        borrowed.into()
    }
}
impl From<Locale> for DataLocale {
    /// Converts an owned [`Locale`] by reusing the borrowed conversion.
    fn from(locale: Locale) -> Self {
        let borrowed = &locale;
        borrowed.into()
    }
}
impl From<&LanguageIdentifier> for DataLocale {
    /// Copies language, script and region, and keeps only the first variant (if any).
    /// A language identifier carries no extensions, so the subdivision is left unset.
    fn from(langid: &LanguageIdentifier) -> Self {
        let first_variant = langid.variants.iter().next().copied();
        Self {
            subdivision: None,
            variant: first_variant,
            region: langid.region,
            script: langid.script,
            language: langid.language,
        }
    }
}
impl From<&Locale> for DataLocale {
    /// Converts the language identifier part of `locale` and additionally picks up
    /// the `-u-sd-` keyword when its value is a single subtag.
    fn from(locale: &Locale) -> Self {
        let subdivision = locale
            .extensions
            .unicode
            .keywords
            .get(&unicode_ext::key!("sd"))
            .and_then(|value| value.as_single_subtag().copied());
        Self {
            subdivision,
            ..Self::from(&locale.id)
        }
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for DataLocale {
    type Err = ParseError;

    /// Parses the string's bytes with [`DataLocale::try_from_utf8`].
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_utf8(s.as_bytes())
    }
}
impl DataLocale {
/// Parses a [`DataLocale`] from a string slice.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
#[inline]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// Parses a [`DataLocale`] from a UTF-8 byte slice.
///
/// The input is first parsed as a full [`Locale`]; it is then rejected with
/// [`ParseError::InvalidExtension`] if it has more than one variant, any transform,
/// private-use or other extension, any unicode attribute, or any unicode keyword
/// other than a lone `sd`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let locale = Locale::try_from_utf8(code_units)?;
    let extensions = &locale.extensions;

    let has_unsupported_parts = locale.id.variants.len() > 1
        || !extensions.transform.is_empty()
        || !extensions.private.is_empty()
        || !extensions.other.is_empty()
        || !extensions.unicode.attributes.is_empty();
    if has_unsupported_parts {
        return Err(ParseError::InvalidExtension);
    }

    // Either no unicode keyword at all, or exactly one and it must be `sd`.
    let keywords = &extensions.unicode.keywords;
    let keywords_ok = match keywords.iter().count() {
        0 => true,
        1 => keywords.contains_key(&unicode_ext::key!("sd")),
        _ => false,
    };
    if !keywords_ok {
        return Err(ParseError::InvalidExtension);
    }

    // NOTE(review): the `From<Locale>` conversion only keeps an `sd` value that is a
    // single subtag; any other value appears to be dropped here rather than
    // rejected — confirm that this is intended.
    Ok(locale.into())
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.as_str())?;
}
if let Some(ref region) = self.region {
f(region.as_str())?;
}
if let Some(ref single_variant) = self.variant {
f(single_variant.as_str())?;
}
if let Some(ref subdivision) = self.subdivision {
f("u")?;
f("sd")?;
f(subdivision.as_str())?;
}
Ok(())
}
fn as_tuple(
&self,
) -> (
Language,
Option<Script>,
Option<Region>,
Option<Variant>,
Option<Subtag>,
) {
(
self.language,
self.script,
self.region,
self.variant,
self.subdivision,
)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// The comparison is field-wise: language, script, region, variant, subdivision.
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    Ord::cmp(&lhs, &rhs)
}
/// Compare this [`DataLocale`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`DataLocale`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// ```
/// use core::cmp::Ordering;
/// use icu_provider::DataLocale;
///
/// let bcp47_strings: &[&str] = &[
/// "ca",
/// "ca-ES",
/// "ca-ES-u-sd-esct",
/// "ca-ES-valencia",
/// "cat",
/// "pl-Latn-PL",
/// "und",
/// "und-fonipa",
/// "zh",
/// ];
///
/// for ab in bcp47_strings.windows(2) {
/// let a = ab[0];
/// let b = ab[1];
/// assert_eq!(a.cmp(b), Ordering::Less, "strings: {} < {}", a, b);
/// let a_loc: DataLocale = a.parse().unwrap();
/// assert_eq!(
/// a_loc.strict_cmp(a.as_bytes()),
/// Ordering::Equal,
/// "strict_cmp: {} == {}",
/// a_loc,
/// a
/// );
/// assert_eq!(
/// a_loc.strict_cmp(b.as_bytes()),
/// Ordering::Less,
/// "strict_cmp: {} < {}",
/// a_loc,
/// b
/// );
/// let b_loc: DataLocale = b.parse().unwrap();
/// assert_eq!(
/// b_loc.strict_cmp(b.as_bytes()),
/// Ordering::Equal,
/// "strict_cmp: {} == {}",
/// b_loc,
/// b
/// );
/// assert_eq!(
/// b_loc.strict_cmp(a.as_bytes()),
/// Ordering::Greater,
/// "strict_cmp: {} > {}",
/// b_loc,
/// a
/// );
/// }
/// ```
///
/// Comparison against invalid strings:
///
/// ```
/// use icu_provider::DataLocale;
///
/// let invalid_strings: &[&str] = &[
/// // Less than "ca-ES"
/// "CA",
/// "ar-x-gbp-FOO",
/// // Greater than "ca-ES"
/// "ca_ES",
/// "ca-ES-x-gbp-FOO",
/// ];
///
/// let data_locale = "ca-ES".parse::<DataLocale>().unwrap();
///
/// for s in invalid_strings.iter() {
/// let expected_ordering = "ca-ES".cmp(s);
/// let actual_ordering = data_locale.strict_cmp(s.as_bytes());
/// assert_eq!(expected_ordering, actual_ordering, "{}", s);
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
// Delegates to `writeable::cmp_utf8`, passing this locale and the raw bytes.
writeable::cmp_utf8(self, other)
}
/// Returns whether this [`DataLocale`] is `und` in the locale and extensions portion.
///
/// # Examples
///
/// ```
/// use icu_provider::DataLocale;
///
/// assert!("und".parse::<DataLocale>().unwrap().is_unknown());
/// assert!(!"de-u-sd-denw".parse::<DataLocale>().unwrap().is_unknown());
/// assert!(!"und-ES".parse::<DataLocale>().unwrap().is_unknown());
/// ```
pub fn is_unknown(&self) -> bool {
    // Anything beyond the language subtag makes the locale "known".
    let has_extra_subtag = self.script.is_some()
        || self.region.is_some()
        || self.variant.is_some()
        || self.subdivision.is_some();
    self.language.is_unknown() && !has_extra_subtag
}
/// Converts this `DataLocale` into a [`Locale`].
///
/// The single variant (if any) becomes the variants list, and the subdivision
/// (if any) becomes the sole unicode keyword, `sd`.
pub fn into_locale(self) -> Locale {
    let variants = match self.variant {
        Some(variant) => crate::subtags::Variants::from_variant(variant),
        None => crate::subtags::Variants::default(),
    };
    let id = LanguageIdentifier {
        language: self.language,
        script: self.script,
        region: self.region,
        variants,
    };

    let mut extensions = crate::extensions::Extensions::default();
    if let Some(sd) = self.subdivision {
        let keywords = unicode_ext::Keywords::new_single(
            unicode_ext::key!("sd"),
            unicode_ext::Value::from_subtag(Some(sd)),
        );
        extensions.unicode = unicode_ext::Unicode {
            keywords,
            ..Default::default()
        };
    }

    Locale { id, extensions }
}
}
#[test]
fn test_data_locale_to_string() {
    // Each entry is (string to parse, expected serialization).
    let cases: [(&str, &str); 3] = [
        ("und", "und"),
        ("und-u-sd-sdd", "und-u-sd-sdd"),
        ("en-ZA-u-sd-zaa", "en-ZA-u-sd-zaa"),
    ];
    for (input, expected) in cases {
        let parsed: DataLocale = input.parse().unwrap();
        writeable::assert_writeable_eq!(parsed, expected);
    }
}
#[test]
fn test_data_locale_from_string() {
    #[derive(Debug)]
    struct TestCase {
        pub input: &'static str,
        pub success: bool,
    }
    let cases = [
        TestCase {
            input: "und",
            success: true,
        },
        TestCase {
            input: "und-u-cu-gbp",
            success: false,
        },
        TestCase {
            input: "en-ZA-u-sd-zaa",
            success: true,
        },
        TestCase {
            input: "en...",
            success: false,
        },
    ];
    for cas in cases {
        let outcome = DataLocale::from_str(cas.input);
        if !cas.success {
            // Inputs expected to fail have nothing further to check.
            if outcome.is_ok() {
                panic!("DataLocale parsed but it was supposed to fail: {cas:?}");
            }
            continue;
        }
        let Ok(data_locale) = outcome else {
            panic!("DataLocale was supposed to parse but it failed: {cas:?}");
        };
        // Successful parses must round-trip back to the input string.
        writeable::assert_writeable_eq!(data_locale, cas.input);
    }
}

22
vendor/icu_locale_core/src/databake.rs vendored Normal file
View File

@@ -0,0 +1,22 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::LanguageIdentifier;
use databake::*;
impl Bake for LanguageIdentifier {
    /// Emits tokens that rebuild this identifier: the `langid!` macro when there is
    /// at most one variant, otherwise a runtime `try_from_str(...).unwrap()` call.
    fn bake(&self, env: &CrateEnv) -> TokenStream {
        env.insert("icu_locale_core");
        let repr = self.to_string();
        match self.variants.len() {
            0 | 1 => quote! {
                icu_locale_core::langid!(#repr)
            },
            _ => quote! {
                icu_locale_core::LanguageIdentifier::try_from_str(#repr).unwrap()
            },
        }
    }
}

View File

@@ -0,0 +1,399 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with
//! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`]
//! is called [`Locale`].
//!
//! There are four types of extensions:
//!
//! * [`Unicode Extensions`] - marked as `u`.
//! * [`Transform Extensions`] - marked as `t`.
//! * [`Private Use Extensions`] - marked as `x`.
//! * [`Other Extensions`] - marked as any `a-z` except for `u`, `t` and `x`.
//!
//! One can think of extensions as a bag of extra information on top of basic 4 [`subtags`].
//!
//! Notice: `Other` extension type is currently not supported.
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::unicode::{Key, Value};
//! use icu::locale::Locale;
//!
//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
//! .parse()
//! .expect("Failed to parse.");
//!
//! assert_eq!(loc.id.language, "en".parse().unwrap());
//! assert_eq!(loc.id.script, None);
//! assert_eq!(loc.id.region, Some("US".parse().unwrap()));
//! assert_eq!(loc.id.variants.len(), 0);
//!
//! let key: Key = "ca".parse().expect("Parsing key failed.");
//! let value: Value = "buddhist".parse().expect("Parsing value failed.");
//! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
//! ```
//!
//! # Syntactic vs Semantic Extension Handling
//!
//! This module is useful when you need to work with Locale extensions at a syntactic level,
//! perhaps for parsing or generating locale identifiers that include any syntactically valid
//! extensions.
//! For handling and validating known CLDR values with semantic meaning, see the
//! [`crate::preferences::extensions`] module.
//!
//! [`LanguageIdentifier`]: super::LanguageIdentifier
//! [`Locale`]: super::Locale
//! [`subtags`]: super::subtags
//! [`Other Extensions`]: other
//! [`Private Use Extensions`]: private
//! [`Transform Extensions`]: transform
//! [`Unicode Extensions`]: unicode
pub mod other;
pub mod private;
pub mod transform;
pub mod unicode;
use core::cmp::Ordering;
use other::Other;
use private::{Private, PRIVATE_EXT_CHAR};
use transform::{Transform, TRANSFORM_EXT_CHAR};
use unicode::{Unicode, UNICODE_EXT_CHAR};
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::subtags;
/// Defines the type of extension.
///
/// Each variant corresponds to the single-letter singleton that introduces the
/// extension in a locale identifier.
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
#[non_exhaustive]
pub enum ExtensionType {
/// Transform Extension Type marked as `t`.
Transform,
/// Unicode Extension Type marked as `u`.
Unicode,
/// Private Extension Type marked as `x`.
Private,
/// All other extension types.
///
/// The payload is the singleton byte; `try_from_byte` stores it ASCII-lowercased.
Other(u8),
}
impl ExtensionType {
    /// Parses an extension singleton from a subtag that must be exactly one byte long.
    ///
    /// Any other length yields [`ParseError::InvalidExtension`].
    #[allow(dead_code)]
    pub(crate) const fn try_from_byte_slice(key: &[u8]) -> Result<Self, ParseError> {
        match key {
            [single] => Self::try_from_byte(*single),
            _ => Err(ParseError::InvalidExtension),
        }
    }

    /// Classifies a singleton byte, ignoring ASCII case.
    ///
    /// The unicode, transform and private singletons map to their dedicated variants;
    /// every other ASCII letter becomes [`ExtensionType::Other`] carrying the
    /// lowercased byte. Non-letters yield [`ParseError::InvalidExtension`].
    pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParseError> {
        let lowered = key.to_ascii_lowercase();
        match lowered as char {
            PRIVATE_EXT_CHAR => Ok(Self::Private),
            TRANSFORM_EXT_CHAR => Ok(Self::Transform),
            UNICODE_EXT_CHAR => Ok(Self::Unicode),
            'a'..='z' => Ok(Self::Other(lowered)),
            _ => Err(ParseError::InvalidExtension),
        }
    }

    /// Same contract as [`Self::try_from_byte_slice`]: the input must be exactly one byte.
    pub(crate) const fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
        if let [single] = code_units {
            Self::try_from_byte(*single)
        } else {
            Err(ParseError::InvalidExtension)
        }
    }
}
/// A map of extensions associated with a given [`Locale`](crate::Locale).
///
/// The type of the `other` field depends on the `alloc` Cargo feature: a `Vec` when
/// it is enabled, a `&'static` slice otherwise.
#[derive(Debug, Default, PartialEq, Eq, Clone, Hash)]
#[non_exhaustive]
pub struct Extensions {
/// A representation of the data for a Unicode extension, when present in the locale identifier.
pub unicode: Unicode,
/// A representation of the data for a transform extension, when present in the locale identifier.
pub transform: Transform,
/// A representation of the data for a private-use extension, when present in the locale identifier.
pub private: Private,
/// A sequence of any other extensions that are present in the locale identifier but are not formally
/// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
/// and [`Private`] are.
#[cfg(feature = "alloc")]
pub other: Vec<Other>,
/// A sequence of any other extensions that are present in the locale identifier but are not formally
/// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
/// and [`Private`] are.
#[cfg(not(feature = "alloc"))]
pub other: &'static [Other],
}
impl Extensions {
/// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::Extensions;
///
/// assert_eq!(Extensions::new(), Extensions::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self {
unicode: Unicode::new(),
transform: Transform::new(),
private: Private::new(),
#[cfg(feature = "alloc")]
other: Vec::new(),
#[cfg(not(feature = "alloc"))]
other: &[],
}
}
/// Function to create a new map of extensions containing exactly one unicode extension, callable in `const`
/// context.
///
/// The transform and private extensions are created empty, and so is the list of
/// other extensions.
#[inline]
pub const fn from_unicode(unicode: Unicode) -> Self {
Self {
unicode,
transform: Transform::new(),
private: Private::new(),
// `other` is a `Vec` with `alloc` and a static slice without it; both start empty.
#[cfg(feature = "alloc")]
other: Vec::new(),
#[cfg(not(feature = "alloc"))]
other: &[],
}
}
/// Returns whether there are no extensions present.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
///
/// assert!(!loc.extensions.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
    // Empty means that none of the four extension kinds holds anything.
    let any_present = !self.unicode.is_empty()
        || !self.transform.is_empty()
        || !self.private.is_empty()
        || !self.other.is_empty();
    !any_present
}
/// Borrows every extension as one nested tuple (unicode, transform, private, other)
/// so that comparisons can reuse the tuple `Ord` implementation.
#[expect(clippy::type_complexity)]
pub(crate) fn as_tuple(
    &self,
) -> (
    (&unicode::Attributes, &unicode::Keywords),
    (
        Option<(
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
            &subtags::Variants,
        )>,
        &transform::Fields,
    ),
    &private::Private,
    &[other::Other],
) {
    let unicode_parts = self.unicode.as_tuple();
    let transform_parts = self.transform.as_tuple();
    (unicode_parts, transform_parts, &self.private, &self.other)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// The ordering may or may not be equivalent to string ordering, and it
/// may or may not be stable across ICU4X releases.
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    Ord::cmp(&lhs, &rhs)
}
/// Retains the specified extension types, clearing all others.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::ExtensionType;
/// use icu::locale::Locale;
///
/// let loc: Locale =
/// "und-a-hello-t-mul-u-world-z-zzz-x-extra".parse().unwrap();
///
/// let mut only_unicode = loc.clone();
/// only_unicode
/// .extensions
/// .retain_by_type(|t| t == ExtensionType::Unicode);
/// assert_eq!(only_unicode, "und-u-world".parse().unwrap());
///
/// let mut only_t_z = loc.clone();
/// only_t_z.extensions.retain_by_type(|t| {
/// t == ExtensionType::Transform || t == ExtensionType::Other(b'z')
/// });
/// assert_eq!(only_t_z, "und-t-mul-z-zzz".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn retain_by_type<F>(&mut self, mut predicate: F)
where
    F: FnMut(ExtensionType) -> bool,
{
    // The predicate is consulted in a fixed order: unicode, transform, private,
    // then each "other" extension in stored order.
    let keep_unicode = predicate(ExtensionType::Unicode);
    if !keep_unicode {
        self.unicode.clear();
    }
    let keep_transform = predicate(ExtensionType::Transform);
    if !keep_transform {
        self.transform.clear();
    }
    let keep_private = predicate(ExtensionType::Private);
    if !keep_private {
        self.private.clear();
    }
    // The whole function is already gated on `alloc`, so `other` is a `Vec` here.
    self.other.retain(|extension| {
        let singleton = extension.get_ext_byte();
        predicate(ExtensionType::Other(singleton))
    });
}
/// Parses all extensions from the remaining subtags of `iter`.
///
/// Each extension must be introduced by a one-byte singleton. A repeated singleton
/// yields [`ParseError::DuplicatedExtension`]; anything else that is malformed yields
/// [`ParseError::InvalidExtension`]. "Other" extensions are kept in sorted order.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    let mut unicode = None;
    let mut transform = None;
    let mut private = None;
    let mut other: Vec<Other> = Vec::new();

    while let Some(singleton) = iter.next() {
        // The one-element pattern rejects empty and multi-byte subtags alike.
        let &[ext_byte] = singleton else {
            return Err(ParseError::InvalidExtension);
        };
        let Ok(ext_type) = ExtensionType::try_from_byte(ext_byte) else {
            return Err(ParseError::InvalidExtension);
        };
        match ext_type {
            ExtensionType::Unicode => {
                if unicode.is_some() {
                    return Err(ParseError::DuplicatedExtension);
                }
                unicode = Some(Unicode::try_from_iter(iter)?);
            }
            ExtensionType::Transform => {
                if transform.is_some() {
                    return Err(ParseError::DuplicatedExtension);
                }
                transform = Some(Transform::try_from_iter(iter)?);
            }
            ExtensionType::Private => {
                if private.is_some() {
                    return Err(ParseError::DuplicatedExtension);
                }
                private = Some(Private::try_from_iter(iter)?);
            }
            ExtensionType::Other(ext) => {
                let already_seen = other.iter().any(|existing| existing.get_ext_byte() == ext);
                if already_seen {
                    return Err(ParseError::DuplicatedExtension);
                }
                let parsed = Other::try_from_iter(ext, iter)?;
                // Insert at the sorted position; an exact match is treated as invalid.
                match other.binary_search(&parsed) {
                    Err(position) => other.insert(position, parsed),
                    Ok(_) => return Err(ParseError::InvalidExtension),
                }
            }
        }
    }

    Ok(Self {
        unicode: unicode.unwrap_or_default(),
        transform: transform.unwrap_or_default(),
        private: private.unwrap_or_default(),
        other,
    })
}
/// Feeds every extension subtag string to `f`, stopping at the first error.
///
/// Emission order: the "other" extensions in stored order, with transform and
/// unicode written together just before the first "other" extension whose
/// singleton sorts after `t` (or after all of them if there is none), and the
/// private-use extension last.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
// Set once the transform/unicode pair has been emitted.
let mut wrote_tu = false;
// Alphabetic by singleton
self.other.iter().try_for_each(|other| {
if other.get_ext() > TRANSFORM_EXT_CHAR && !wrote_tu {
// Since 't' and 'u' are next to each other in alphabetical
// order, write both now.
self.transform.for_each_subtag_str(f, true)?;
self.unicode.for_each_subtag_str(f, true)?;
wrote_tu = true;
}
other.for_each_subtag_str(f, true)?;
Ok(())
})?;
// No "other" extension sorted after 't', so the pair has not been written yet.
if !wrote_tu {
self.transform.for_each_subtag_str(f, true)?;
self.unicode.for_each_subtag_str(f, true)?;
}
// Private must be written last, since it allows single character
// keys. Extensions must also be written in alphabetical order,
// which would seem to imply that other extensions `y` and `z` are
// invalid, but this is not specified.
self.private.for_each_subtag_str(f, true)?;
Ok(())
}
}
// Invokes the crate-level `impl_writeable_for_each_subtag_str_no_test!` macro for
// `Extensions`. The macro is defined outside this file; presumably it derives the
// `Writeable`/`Display` impls from `for_each_subtag_str` — TODO confirm.
impl_writeable_for_each_subtag_str_no_test!(Extensions);
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // No extensions at all serialize to the empty string.
    assert_writeable_eq!(Extensions::new(), "");

    // Each entry is (locale to parse, expected serialization of its extensions).
    let cases: [(&str, &str); 5] = [
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];
    for (locale_str, expected) in cases {
        let locale = locale_str.parse::<Locale>().unwrap();
        assert_writeable_eq!(locale.extensions, expected);
    }
}

View File

@@ -0,0 +1,260 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Other Use Extensions is a list of extensions other than unicode,
//! transform or private.
//!
//! Those extensions are treated as a pass-through, and no Unicode related
//! behavior depends on them.
//!
//! The main struct for this extension is [`Other`] which is a list of [`Subtag`]s.
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::other::Other;
//! use icu::locale::Locale;
//!
//! let mut loc: Locale = "en-US-a-foo-faa".parse().expect("Parsing failed.");
//! ```
#[cfg(feature = "alloc")]
use core::str::FromStr;
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
use crate::subtags::Subtag;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
/// A list of [`Other Use Extensions`] as defined in [`Unicode Locale
/// Identifier`] specification.
///
/// Those extensions are treated as a pass-through, and no Unicode related
/// behavior depends on them.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::other::Other;
/// use icu::locale::subtags::Subtag;
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let other = Other::from_vec_unchecked(b'a', vec![subtag1, subtag2]);
/// assert_eq!(&other.to_string(), "a-foo-bar");
/// ```
///
/// [`Other Use Extensions`]: https://unicode.org/reports/tr35/#other_extensions
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct Other {
// The singleton byte that introduces this extension.
// Safety invariant: must be ASCII
ext: u8,
// The subtags that follow the singleton, in the order given.
keys: ShortBoxSlice<Subtag>,
}
impl Other {
/// Parses a string slice into a well-formed [`Other`].
///
/// This is a thin wrapper over [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
#[inline]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// The first subtag must be a singleton classified as [`ExtensionType::Other`];
/// the remaining subtags are parsed as its keys.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let mut subtags = SubtagIterator::new(code_units);
    let singleton = subtags.next().ok_or(ParseError::InvalidExtension)?;
    match ExtensionType::try_from_byte_slice(singleton)? {
        ExtensionType::Other(ext) => Self::try_from_iter(ext, &mut subtags),
        _ => Err(ParseError::InvalidExtension),
    }
}
/// A constructor which takes a pre-sorted list of [`Subtag`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Panics
///
/// Panics if `ext` is not ASCII alphabetic.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::other::Other;
/// use icu::locale::subtags::Subtag;
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let other = Other::from_vec_unchecked(b'a', vec![subtag1, subtag2]);
/// assert_eq!(&other.to_string(), "a-foo-bar");
/// ```
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(ext: u8, keys: Vec<Subtag>) -> Self {
    // Convert the vector into the compact slice representation first.
    let keys: ShortBoxSlice<Subtag> = keys.into();
    Self::from_short_slice_unchecked(ext, keys)
}
// Builds an `Other` from a singleton byte and an already-built key list.
// "Unchecked" refers to the keys, which are stored as given; the singleton itself
// is checked, and a byte that is not an ASCII letter panics.
#[allow(dead_code)]
pub(crate) fn from_short_slice_unchecked(ext: u8, keys: ShortBoxSlice<Subtag>) -> Self {
assert!(ext.is_ascii_alphabetic());
// Safety invariant upheld here: ext checked as ASCII above
Self { ext, keys }
}
/// Collects the keys of an "other" extension whose singleton `ext` has already been
/// consumed from `iter`.
///
/// Subtags are taken while `Subtag::valid_key` accepts them; the first rejected
/// subtag is left in the iterator. An extension with no keys is invalid.
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(ext: u8, iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    debug_assert!(matches!(
        ExtensionType::try_from_byte(ext),
        Ok(ExtensionType::Other(_)),
    ));

    let mut keys = ShortBoxSlice::new();
    while let Some(candidate) = iter.peek() {
        if !Subtag::valid_key(candidate) {
            break;
        }
        // NOTE(review): a subtag that passes `valid_key` but fails to parse is
        // consumed without being stored — presumably unreachable; confirm.
        if let Ok(parsed) = Subtag::try_from_utf8(candidate) {
            keys.push(parsed);
        }
        iter.next();
    }

    if keys.is_empty() {
        return Err(ParseError::InvalidExtension);
    }
    Ok(Self::from_short_slice_unchecked(ext, keys))
}
/// Gets the tag character for this extension as a &str.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "und-a-hello-world".parse().unwrap();
/// let other_ext = &loc.extensions.other[0];
/// assert_eq!(other_ext.get_ext_str(), "a");
/// ```
pub fn get_ext_str(&self) -> &str {
debug_assert!(self.ext.is_ascii_alphabetic());
// SAFETY: `self.ext` is ASCII by the safety invariant on the field, and a single
// ASCII byte is valid UTF-8, so the one-byte slice is a valid `str`.
unsafe { core::str::from_utf8_unchecked(core::slice::from_ref(&self.ext)) }
}
/// Gets the tag character for this extension as a char.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "und-a-hello-world".parse().unwrap();
/// let other_ext = &loc.extensions.other[0];
/// assert_eq!(other_ext.get_ext(), 'a');
/// ```
pub fn get_ext(&self) -> char {
    // Widening a byte to a `char` is lossless.
    char::from(self.ext)
}
/// Gets the tag character for this extension as a byte.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "und-a-hello-world".parse().unwrap();
/// let other_ext = &loc.extensions.other[0];
/// assert_eq!(other_ext.get_ext_byte(), b'a');
/// ```
pub fn get_ext_byte(&self) -> u8 {
// Returns the stored singleton byte unchanged.
self.ext
}
/// Feeds this extension's subtag strings to `f`: the singleton first when
/// `with_ext` is set, then every key. Nothing is emitted when there are no keys.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    if self.keys.is_empty() {
        return Ok(());
    }
    if with_ext {
        f(self.get_ext_str())?;
    }
    for key in self.keys.iter() {
        f(key.as_str())?;
    }
    Ok(())
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Other {
    type Err = ParseError;

    /// Parses the string's bytes with [`Other::try_from_utf8`].
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_utf8(s.as_bytes())
    }
}
// Invokes `writeable::impl_display_with_writeable!` for `Other`; presumably this
// provides `Display` on top of the `Writeable` impl, with the extra argument gating
// an `alloc`-only part — TODO confirm against the `writeable` crate.
writeable::impl_display_with_writeable!(Other, #[cfg(feature = "alloc")]);
impl writeable::Writeable for Other {
    /// Writes `<singleton>-<key>-<key>...`, or nothing at all when there are no keys.
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        if self.keys.is_empty() {
            return Ok(());
        }
        sink.write_str(self.get_ext_str())?;
        self.keys.iter().try_for_each(|key| {
            sink.write_char('-')?;
            writeable::Writeable::write_to(key, sink)
        })
    }

    /// One byte for the singleton plus, per key, its own hint and one separator.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        if self.keys.is_empty() {
            return writeable::LengthHint::exact(0);
        }
        let mut total = writeable::LengthHint::exact(1);
        self.keys.iter().for_each(|key| {
            total += writeable::Writeable::writeable_length_hint(key) + 1;
        });
        total
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed extension round-trips; a bare singleton is rejected.
    #[test]
    fn test_other_extension_fromstr() {
        let parsed = "o-foo-bar"
            .parse::<Other>()
            .expect("Failed to parse Other");
        assert_eq!(parsed.to_string(), "o-foo-bar");

        let bare_singleton = "o".parse::<Other>();
        assert!(bare_singleton.is_err());
    }
}

View File

@@ -0,0 +1,257 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Private Use Extensions is a list of extensions intended for
//! private use.
//!
//! Those extensions are treated as a pass-through, and no Unicode related
//! behavior depends on them.
//!
//! The main struct for this extension is [`Private`] which is a list of [`Subtag`]s.
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::private::subtag;
//! use icu::locale::{locale, Locale};
//!
//! let mut loc: Locale = "en-US-x-foo-faa".parse().expect("Parsing failed.");
//!
//! assert!(loc.extensions.private.contains(&subtag!("foo")));
//! assert_eq!(loc.extensions.private.iter().next(), Some(&subtag!("foo")));
//!
//! loc.extensions.private.clear();
//!
//! assert!(loc.extensions.private.is_empty());
//! assert_eq!(loc, locale!("en-US"));
//! ```
mod other;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
#[cfg(feature = "alloc")]
use core::str::FromStr;
#[doc(inline)]
pub use other::{subtag, Subtag};
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
/// Singleton that introduces a private-use extension, as a `char`.
pub(crate) const PRIVATE_EXT_CHAR: char = 'x';
/// Same singleton as [`PRIVATE_EXT_CHAR`], as a string slice.
pub(crate) const PRIVATE_EXT_STR: &str = "x";
/// A list of [`Private Use Extensions`] as defined in [`Unicode Locale
/// Identifier`] specification.
///
/// Those extensions are treated as a pass-through, and no Unicode related
/// behavior depends on them.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let private = Private::from_vec_unchecked(vec![subtag1, subtag2]);
/// assert_eq!(&private.to_string(), "x-foo-bar");
/// ```
///
/// [`Private Use Extensions`]: https://unicode.org/reports/tr35/#pu_extensions
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
// Newtype over the list of private-use subtags; the `x` singleton is not stored.
pub struct Private(ShortBoxSlice<Subtag>);
impl Private {
/// Returns a new empty list of private-use extensions. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::Private;
///
/// assert_eq!(Private::new(), Private::default());
/// ```
#[inline]
pub const fn new() -> Self {
// Wraps an empty `ShortBoxSlice`; `ShortBoxSlice::new` is usable in `const` context.
Self(ShortBoxSlice::new())
}
/// Parses a string slice into a well-formed [`Private`].
///
/// This is a thin wrapper over [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
#[inline]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// The first subtag must be the private-use singleton; the remaining subtags are
/// parsed as the private-use subtags.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let mut subtags = SubtagIterator::new(code_units);
    let singleton = subtags.next().ok_or(ParseError::InvalidExtension)?;
    match ExtensionType::try_from_byte_slice(singleton)? {
        ExtensionType::Private => Self::try_from_iter(&mut subtags),
        _ => Err(ParseError::InvalidExtension),
    }
}
/// A constructor which takes a pre-sorted list of [`Subtag`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
///
/// let private = Private::from_vec_unchecked(vec![subtag1, subtag2]);
/// assert_eq!(&private.to_string(), "x-foo-bar");
/// ```
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(input: Vec<Subtag>) -> Self {
Self(input.into())
}
/// A constructor which takes a single [`Subtag`].
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
///
/// let private = Private::new_single(subtag);
/// assert_eq!(&private.to_string(), "x-foo");
/// ```
pub const fn new_single(input: Subtag) -> Self {
Self(ShortBoxSlice::new_single(input))
}
/// Empties the [`Private`] list.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::{Private, Subtag};
///
/// let subtag1: Subtag = "foo".parse().expect("Failed to parse a Subtag.");
/// let subtag2: Subtag = "bar".parse().expect("Failed to parse a Subtag.");
/// let mut private = Private::from_vec_unchecked(vec![subtag1, subtag2]);
///
/// assert_eq!(&private.to_string(), "x-foo-bar");
///
/// private.clear();
///
/// assert_eq!(private, Private::new());
/// ```
pub fn clear(&mut self) {
self.0.clear();
}
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
let keys = iter
.map(Subtag::try_from_utf8)
.collect::<Result<ShortBoxSlice<_>, _>>()?;
if keys.is_empty() {
Err(ParseError::InvalidExtension)
} else {
Ok(Self(keys))
}
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
if self.is_empty() {
return Ok(());
}
if with_ext {
f(PRIVATE_EXT_STR)?;
}
self.deref().iter().map(|t| t.as_str()).try_for_each(f)
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Private {
    type Err = ParseError;

    /// Parses the string's bytes the same way [`Private::try_from_str`] does.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Private::try_from_utf8(s.as_bytes())
    }
}
writeable::impl_display_with_writeable!(Private, #[cfg(feature = "alloc")]);
impl writeable::Writeable for Private {
fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
if self.is_empty() {
return Ok(());
}
sink.write_char(PRIVATE_EXT_CHAR)?;
for key in self.iter() {
sink.write_char('-')?;
writeable::Writeable::write_to(key, sink)?;
}
Ok(())
}
fn writeable_length_hint(&self) -> writeable::LengthHint {
if self.is_empty() {
return writeable::LengthHint::exact(0);
}
let mut result = writeable::LengthHint::exact(1);
for key in self.iter() {
result += writeable::Writeable::writeable_length_hint(key) + 1;
}
result
}
}
impl Deref for Private {
    type Target = [Subtag];

    /// Exposes the stored subtags as a slice.
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed private-use extension round-trips through parsing and
    /// display; the bare singleton is rejected.
    #[test]
    fn test_private_extension_fromstr() {
        let parsed = "x-foo-bar-l-baz"
            .parse::<Private>()
            .expect("Failed to parse Private");
        assert_eq!(parsed.to_string(), "x-foo-bar-l-baz");

        let singleton_only = "x".parse::<Private>();
        assert!(singleton_only.is_err());
    }
}

View File

@@ -0,0 +1,47 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A single item used in a list of [`Private`](super::Private) extensions.
///
/// The subtag has to be an ASCII alphanumeric string no shorter than
/// one character and no longer than eight. Parsing accepts any letter case;
/// the stored form is lowercase.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::private::Subtag;
///
/// let subtag1: Subtag = "Foo".parse()
/// .expect("Failed to parse a Subtag.");
///
/// assert_eq!(subtag1.as_str(), "foo");
/// ```
///
/// Note: This is different from the generic [`Subtag`](crate::subtags::Subtag),
/// which is between two and eight characters long.
///
/// ```
/// use icu::locale::extensions::private;
/// use icu::locale::subtags;
///
/// let subtag: Result<private::Subtag, _> = "f".parse();
/// assert!(subtag.is_ok());
///
/// let subtag: Result<subtags::Subtag, _> = "f".parse();
/// assert!(subtag.is_err());
/// ```
Subtag,
extensions::private,
subtag,
extensions_private_subtag,
1..=8,
s,
s.is_ascii_alphanumeric(),
s.to_ascii_lowercase(),
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
InvalidExtension,
["foo12"],
["toolooong"],
);

View File

@@ -0,0 +1,234 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::borrow::Borrow;
use litemap::LiteMap;
use super::Key;
use super::Value;
/// A list of [`Key`]-[`Value`] pairs representing functional information
/// about content transformations.
///
/// Here are examples of fields used in Unicode:
/// - `s0`, `d0` - Transform source/destination
/// - `t0` - Machine Translation
/// - `h0` - Hybrid Locale Identifiers
///
/// You can find the full list in the [`Unicode BCP 47 T Extension`] section of LDML.
///
/// [`Unicode BCP 47 T Extension`]: https://unicode.org/reports/tr35/tr35.html#BCP47_T_Extension
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Fields, Value};
///
/// let value = "hybrid".parse::<Value>().expect("Failed to parse a Value.");
/// let fields = [(key!("h0"), value)].into_iter().collect::<Fields>();
///
/// assert_eq!(&fields.to_string(), "h0-hybrid");
/// ```
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct Fields(Inner);
// Backing map when `alloc` is available: a `LiteMap` with its default store.
#[cfg(feature = "alloc")]
type Inner = LiteMap<Key, Value>;
// Backing map without `alloc`: a `LiteMap` over a borrowed `'static` slice of
// pairs. The mutating methods of `Fields` are gated on `alloc`.
#[cfg(not(feature = "alloc"))]
type Inner = LiteMap<Key, Value, &'static [(Key, Value)]>;
impl Fields {
/// Returns a new empty list of key-value pairs. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Fields;
///
/// assert_eq!(Fields::new(), Fields::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(LiteMap::new())
}
/// Returns `true` if there are no fields.
///
/// # Examples
///
/// ```
/// use icu::locale::locale;
/// use icu::locale::Locale;
///
/// let loc1 = Locale::try_from_str("und-t-h0-hybrid").unwrap();
/// let loc2 = locale!("und-u-ca-buddhist");
///
/// assert!(!loc1.extensions.transform.fields.is_empty());
/// assert!(loc2.extensions.transform.fields.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Empties the [`Fields`] list.
///
/// Returns the old list.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Fields, Value};
///
/// let value = "hybrid".parse::<Value>().expect("Failed to parse a Value.");
/// let mut fields = [(key!("h0"), value)].into_iter().collect::<Fields>();
///
/// assert_eq!(&fields.to_string(), "h0-hybrid");
///
/// fields.clear();
///
/// assert_eq!(fields, Fields::new());
/// ```
pub fn clear(&mut self) -> Self {
core::mem::take(self)
}
/// Returns `true` if the list contains a [`Value`] for the specified [`Key`].
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{Fields, Key, Value};
///
/// let key: Key = "h0".parse().expect("Failed to parse a Key.");
/// let value: Value = "hybrid".parse().expect("Failed to parse a Value.");
/// let mut fields = [(key, value)].into_iter().collect::<Fields>();
///
/// let key: Key = "h0".parse().expect("Failed to parse a Key.");
/// assert!(&fields.contains_key(&key));
/// ```
pub fn contains_key<Q>(&self, key: &Q) -> bool
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.contains_key(key)
}
/// Returns a reference to the [`Value`] corresponding to the [`Key`].
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Fields, Value};
///
/// let value = "hybrid".parse::<Value>().unwrap();
/// let fields = [(key!("h0"), value.clone())]
/// .into_iter()
/// .collect::<Fields>();
///
/// assert_eq!(fields.get(&key!("h0")), Some(&value));
/// ```
pub fn get<Q>(&self, key: &Q) -> Option<&Value>
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.get(key)
}
/// Sets the specified keyword, returning the old value if it already existed.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{key, Value};
/// use icu::locale::Locale;
///
/// let lower = "lower".parse::<Value>().expect("valid extension subtag");
/// let casefold = "casefold".parse::<Value>().expect("valid extension subtag");
///
/// let mut loc: Locale = "en-t-hi-d0-casefold"
/// .parse()
/// .expect("valid BCP-47 identifier");
/// let old_value = loc.extensions.transform.fields.set(key!("d0"), lower);
///
/// assert_eq!(old_value, Some(casefold));
/// assert_eq!(loc, "en-t-hi-d0-lower".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn set(&mut self, key: Key, value: Value) -> Option<Value> {
self.0.insert(key, value)
}
/// Retains a subset of fields as specified by the predicate function.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::key;
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-t-h0-hybrid-d0-hex-m0-xml".parse().unwrap();
///
/// loc.extensions
/// .transform
/// .fields
/// .retain_by_key(|&k| k == key!("h0"));
/// assert_eq!(loc, "und-t-h0-hybrid".parse().unwrap());
///
/// loc.extensions
/// .transform
/// .fields
/// .retain_by_key(|&k| k == key!("d0"));
/// assert_eq!(loc, Locale::UNKNOWN);
/// ```
#[cfg(feature = "alloc")]
pub fn retain_by_key<F>(&mut self, mut predicate: F)
where
F: FnMut(&Key) -> bool,
{
self.0.retain(|k, _| predicate(k))
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
for (k, v) in self.0.iter() {
f(k.as_str())?;
v.for_each_subtag_str(f)?;
}
Ok(())
}
/// This needs to be its own method to help with type inference in helpers.rs
#[cfg(test)]
pub(crate) fn from_tuple_vec(v: Vec<(Key, Value)>) -> Self {
v.into_iter().collect()
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl From<LiteMap<Key, Value>> for Fields {
    /// Wraps an already-built map as-is.
    fn from(map: LiteMap<Key, Value>) -> Self {
        Fields(map)
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl core::iter::FromIterator<(Key, Value)> for Fields {
    /// Collects key-value pairs into the backing map.
    fn from_iter<I: IntoIterator<Item = (Key, Value)>>(iter: I) -> Self {
        Self(iter.into_iter().collect())
    }
}
// Supplies the `Writeable` support for `Fields` that `Transform::write_to`
// relies on (the macro is defined elsewhere in the crate).
// NOTE(review): the string literals look like sample key/value data for tests
// generated by the macro — confirm against the macro definition.
impl_writeable_for_key_value!(Fields, "h0", "hybrid", "m0", "m0-true");

View File

@@ -0,0 +1,32 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A key used in a list of [`Fields`](super::Fields).
///
/// The key has to be exactly two ASCII characters long, with the first
/// character being alphabetic and the second being a digit.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Key;
///
/// let key1: Key = "k0".parse().expect("Failed to parse a Key.");
///
/// assert_eq!(key1.as_str(), "k0");
/// ```
Key,
extensions::transform,
key,
extensions_transform_key,
2..=2,
s,
s.all_bytes()[0].is_ascii_alphabetic() && s.all_bytes()[1].is_ascii_digit(),
s.to_ascii_lowercase(),
s.all_bytes()[0].is_ascii_lowercase() && s.all_bytes()[1].is_ascii_digit(),
InvalidExtension,
["k0"],
["", "k", "0k", "k12"],
);

View File

@@ -0,0 +1,336 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Transform Extensions provide information on content transformations in a given locale.
//!
//! The main struct for this extension is [`Transform`] which contains [`Fields`] and an
//! optional [`LanguageIdentifier`].
//!
//! [`LanguageIdentifier`]: super::super::LanguageIdentifier
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::transform::{Fields, Key, Transform, Value};
//! use icu::locale::{LanguageIdentifier, Locale};
//!
//! let mut loc: Locale =
//! "en-US-t-es-ar-h0-hybrid".parse().expect("Parsing failed.");
//!
//! let lang: LanguageIdentifier =
//! "es-AR".parse().expect("Parsing LanguageIdentifier failed.");
//!
//! let key: Key = "h0".parse().expect("Parsing key failed.");
//! let value: Value = "hybrid".parse().expect("Parsing value failed.");
//!
//! assert_eq!(loc.extensions.transform.lang, Some(lang));
//! assert!(loc.extensions.transform.fields.contains_key(&key));
//! assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value));
//!
//! assert_eq!(&loc.extensions.transform.to_string(), "t-es-ar-h0-hybrid");
//! ```
mod fields;
mod key;
mod value;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
pub use fields::Fields;
#[doc(inline)]
pub use key::{key, Key};
pub use value::Value;
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
#[cfg(feature = "alloc")]
use crate::parser::{parse_language_identifier_from_iter, ParseError, ParserMode};
#[cfg(feature = "alloc")]
use crate::shortvec::ShortBoxSlice;
use crate::subtags;
#[cfg(feature = "alloc")]
use crate::subtags::Language;
use crate::LanguageIdentifier;
#[cfg(feature = "alloc")]
use litemap::LiteMap;
/// Singleton character written in front of the transform extension when a
/// [`Transform`] is serialized.
pub(crate) const TRANSFORM_EXT_CHAR: char = 't';
/// String form of the transform singleton, handed to subtag-visiting
/// callbacks by `Transform::for_each_subtag_str`.
pub(crate) const TRANSFORM_EXT_STR: &str = "t";
/// A list of [`Unicode BCP47 T Extensions`] as defined in [`Unicode Locale
/// Identifier`] specification.
///
/// The transform extension carries information about the source language or
/// script of transformed content, including content that has been
/// transliterated, transcribed, or translated, or in some other way influenced
/// by the source (see [`RFC 6497`] for details).
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::{Key, Value};
/// use icu::locale::{LanguageIdentifier, Locale};
///
/// let mut loc: Locale =
/// "de-t-en-us-h0-hybrid".parse().expect("Parsing failed.");
///
/// let en_us: LanguageIdentifier = "en-US".parse().expect("Parsing failed.");
///
/// assert_eq!(loc.extensions.transform.lang, Some(en_us));
/// let key: Key = "h0".parse().expect("Parsing key failed.");
/// let value: Value = "hybrid".parse().expect("Parsing value failed.");
/// assert_eq!(loc.extensions.transform.fields.get(&key), Some(&value));
/// ```
///
/// [`Unicode BCP47 T Extensions`]: https://unicode.org/reports/tr35/#t_Extension
/// [`RFC 6497`]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash)]
#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure
pub struct Transform {
/// The [`LanguageIdentifier`] specified with this locale extension, or `None` if not present.
/// It is written in lowercase when the extension is serialized.
pub lang: Option<LanguageIdentifier>,
/// The key-value pairs present in this locale extension, with each extension key subtag
/// associated to its provided value subtag.
pub fields: Fields,
}
impl Transform {
/// Returns a new empty map of Transform extensions. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Transform;
///
/// assert_eq!(Transform::new(), Transform::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self {
lang: None,
fields: Fields::new(),
}
}
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Transform`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
let mut iter = SubtagIterator::new(code_units);
let ext = iter.next().ok_or(ParseError::InvalidExtension)?;
if let ExtensionType::Transform = ExtensionType::try_from_byte_slice(ext)? {
return Self::try_from_iter(&mut iter);
}
Err(ParseError::InvalidExtension)
}
/// Returns `true` if there are no tfields and no tlang in the `TransformExtensionList`.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "en-US-t-es-ar".parse().expect("Parsing failed.");
///
/// assert!(!loc.extensions.transform.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.lang.is_none() && self.fields.is_empty()
}
/// Clears the transform extension, effectively removing it from the locale.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "en-US-t-es-ar".parse().unwrap();
/// loc.extensions.transform.clear();
/// assert_eq!(loc, "en-US".parse().unwrap());
/// ```
pub fn clear(&mut self) {
self.lang = None;
self.fields.clear();
}
#[expect(clippy::type_complexity)]
pub(crate) fn as_tuple(
&self,
) -> (
Option<(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
&subtags::Variants,
)>,
&Fields,
) {
(self.lang.as_ref().map(|l| l.as_tuple()), &self.fields)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// The ordering may or may not be equivalent to string ordering, and it
/// may or may not be stable across ICU4X releases.
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
self.as_tuple().cmp(&other.as_tuple())
}
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
let mut tlang = None;
let mut tfields = LiteMap::new();
if let Some(subtag) = iter.peek() {
if Language::try_from_utf8(subtag).is_ok() {
tlang = Some(parse_language_identifier_from_iter(
iter,
ParserMode::Partial,
)?);
}
}
let mut current_tkey = None;
let mut current_tvalue = ShortBoxSlice::new();
let mut has_current_tvalue = false;
while let Some(subtag) = iter.peek() {
if let Some(tkey) = current_tkey {
if let Ok(val) = Value::parse_subtag(subtag) {
has_current_tvalue = true;
if let Some(val) = val {
current_tvalue.push(val);
}
} else {
if !has_current_tvalue {
return Err(ParseError::InvalidExtension);
}
tfields.try_insert(tkey, Value::from_short_slice_unchecked(current_tvalue));
current_tkey = None;
current_tvalue = ShortBoxSlice::new();
has_current_tvalue = false;
continue;
}
} else if let Ok(tkey) = Key::try_from_utf8(subtag) {
current_tkey = Some(tkey);
} else {
break;
}
iter.next();
}
if let Some(tkey) = current_tkey {
if !has_current_tvalue {
return Err(ParseError::InvalidExtension);
}
tfields.try_insert(tkey, Value::from_short_slice_unchecked(current_tvalue));
}
if tlang.is_none() && tfields.is_empty() {
Err(ParseError::InvalidExtension)
} else {
Ok(Self {
lang: tlang,
fields: tfields.into(),
})
}
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
if self.is_empty() {
return Ok(());
}
if with_ext {
f(TRANSFORM_EXT_STR)?;
}
if let Some(lang) = &self.lang {
lang.for_each_subtag_str_lowercased(f)?;
}
self.fields.for_each_subtag_str(f)
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Transform {
    type Err = ParseError;

    /// Parses the string's bytes the same way [`Transform::try_from_str`] does.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Transform::try_from_utf8(s.as_bytes())
    }
}
writeable::impl_display_with_writeable!(Transform, #[cfg(feature = "alloc")]);

impl writeable::Writeable for Transform {
    /// Writes the extension singleton, then `-<lang>` (lowercased) if a
    /// language is present, then `-<fields>` if any field is present. An empty
    /// extension writes nothing.
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        if self.is_empty() {
            return Ok(());
        }
        sink.write_char(TRANSFORM_EXT_CHAR)?;
        if let Some(langid) = self.lang.as_ref() {
            sink.write_char('-')?;
            langid.write_lowercased_to(sink)?;
        }
        if self.fields.is_empty() {
            return Ok(());
        }
        sink.write_char('-')?;
        writeable::Writeable::write_to(&self.fields, sink)
    }

    /// Length of what [`write_to`](writeable::Writeable::write_to) emits: one
    /// unit for the singleton, plus each present part's hint and its separator.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        if self.is_empty() {
            return writeable::LengthHint::exact(0);
        }
        let mut hint = writeable::LengthHint::exact(1);
        match self.lang.as_ref() {
            Some(langid) => hint += writeable::Writeable::writeable_length_hint(langid) + 1,
            None => {}
        }
        if !self.fields.is_empty() {
            hint += writeable::Writeable::writeable_length_hint(&self.fields) + 1;
        }
        hint
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed transform extension round-trips through parsing and
    /// display; the bare singleton is rejected.
    #[test]
    fn test_transform_extension_fromstr() {
        let parsed = "t-en-us-h0-hybrid"
            .parse::<Transform>()
            .expect("Failed to parse Transform");
        assert_eq!(parsed.to_string(), "t-en-us-h0-hybrid");

        let singleton_only = "t".parse::<Transform>();
        assert!(singleton_only.is_err());
    }
}

View File

@@ -0,0 +1,165 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
use crate::subtags::{subtag, Subtag};
use core::ops::RangeInclusive;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A value used in a list of [`Fields`](super::Fields).
///
/// The value has to be a sequence of one or more alphanumeric strings
/// separated by `-`.
/// Each part of the sequence has to be no shorter than three characters and no
/// longer than eight.
///
/// The subtag `true` is not stored: it is skipped while parsing, and a value
/// with no stored subtags is written back as `true`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Value;
///
/// "hybrid".parse::<Value>().expect("Valid Value.");
///
/// "hybrid-foobar".parse::<Value>().expect("Valid Value.");
///
/// "no".parse::<Value>().expect_err("Invalid Value.");
/// ```
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Default)]
pub struct Value(ShortBoxSlice<Subtag>);
/// Permitted byte length of a single value subtag.
// NOTE(review): the helpers that read this constant are only called from
// `alloc`-gated code in the files visible here, which is presumably why
// `dead_code` is allowed — confirm before removing the attribute.
#[allow(dead_code)]
const TYPE_LENGTH: RangeInclusive<usize> = 3..=8;
/// The `true` subtag: dropped when parsing a value, and emitted again when a
/// value with no stored subtags is serialized.
const TRUE_TVALUE: Subtag = subtag!("true");
impl Value {
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Value`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::transform::Value;
///
/// let value = Value::try_from_str("hybrid").expect("Parsing failed.");
/// ```
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
let mut v = ShortBoxSlice::default();
let mut has_value = false;
for subtag in SubtagIterator::new(code_units) {
if !Self::is_type_subtag(subtag) {
return Err(ParseError::InvalidExtension);
}
has_value = true;
let val = Subtag::try_from_utf8(subtag).map_err(|_| ParseError::InvalidExtension)?;
if val != TRUE_TVALUE {
v.push(val);
}
}
if !has_value {
return Err(ParseError::InvalidExtension);
}
Ok(Self(v))
}
#[allow(dead_code)]
pub(crate) fn from_short_slice_unchecked(input: ShortBoxSlice<Subtag>) -> Self {
Self(input)
}
#[allow(dead_code)]
pub(crate) fn is_type_subtag(t: &[u8]) -> bool {
TYPE_LENGTH.contains(&t.len()) && t.iter().all(u8::is_ascii_alphanumeric)
}
#[allow(dead_code)]
pub(crate) fn parse_subtag(t: &[u8]) -> Result<Option<Subtag>, ParseError> {
if !TYPE_LENGTH.contains(&t.len()) {
return Err(ParseError::InvalidExtension);
}
let s = Subtag::try_from_utf8(t).map_err(|_| ParseError::InvalidSubtag)?;
let s = s.to_ascii_lowercase();
if s == TRUE_TVALUE {
Ok(None)
} else {
Ok(Some(s))
}
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
if self.0.is_empty() {
f(TRUE_TVALUE.as_str())?;
} else {
self.0.iter().map(Subtag::as_str).try_for_each(f)?;
}
Ok(())
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Value {
    type Err = ParseError;

    /// Parses the string's bytes the same way [`Value::try_from_str`] does.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Value::try_from_utf8(s.as_bytes())
    }
}
// Supplies the `Writeable` support for `Value` that `test_writeable` below
// exercises (the macro is defined elsewhere in the crate).
// NOTE(review): the trailing `=> Some("true")` arm presumably short-circuits a
// value with no stored subtags to the literal "true" — confirm against the macro.
impl_writeable_for_each_subtag_str_no_test!(Value, selff, selff.0.is_empty() => Some("true"));
/// An empty value writes as `true`; otherwise the stored subtags are joined with `-`.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    let hybrid = "hybrid".parse::<Subtag>().unwrap();
    let foobar = "foobar".parse::<Subtag>().unwrap();

    assert_writeable_eq!(Value::default(), "true");

    let single = Value::from_short_slice_unchecked(vec![hybrid].into());
    assert_writeable_eq!(single, "hybrid");

    let double = Value::from_short_slice_unchecked(vec![hybrid, foobar].into());
    assert_writeable_eq!(double, "hybrid-foobar");
}
/// Subtags of three to eight characters are accepted; a two-character subtag is rejected.
#[test]
fn test_short_tvalue() {
    let parsed = Value::try_from_str("foo-longstag");
    assert!(parsed.is_ok());
    let parsed = parsed.unwrap();
    assert_eq!(parsed.0.len(), 2);

    let expected = [subtag!("foo"), subtag!("longstag")];
    for (actual, wanted) in parsed.0.iter().zip(expected.iter()) {
        assert_eq!(actual, wanted);
    }

    assert!(Value::try_from_str("foo-ba").is_err());
}

View File

@@ -0,0 +1,34 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// An attribute used in a set of [`Attributes`](super::Attributes).
///
/// An attribute has to be a sequence of ASCII alphanumeric characters no
/// shorter than three and no longer than eight characters.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{attribute, Attribute};
///
/// let attr: Attribute =
/// "buddhist".parse().expect("Failed to parse an Attribute.");
///
/// assert_eq!(attr, attribute!("buddhist"));
/// ```
Attribute,
extensions::unicode,
attribute,
extensions_unicode_attribute,
3..=8,
s,
s.is_ascii_alphanumeric(),
s.to_ascii_lowercase(),
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
InvalidExtension,
["foo12"],
["no", "toolooong"],
);

View File

@@ -0,0 +1,206 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::Attribute;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
#[cfg(feature = "alloc")]
use crate::ParseError;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A set of [`Attribute`] elements as defined in [`Unicode Extension Attributes`].
///
/// Parsing keeps the attributes sorted and free of duplicates.
/// `Attributes::from_vec_unchecked` stores its input as given, so the caller
/// is expected to sort and deduplicate it first, as the example below does.
///
/// [`Unicode Extension Attributes`]: https://unicode.org/reports/tr35/tr35.html#u_Extension
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{Attribute, Attributes};
///
/// let attribute1: Attribute =
/// "foobar".parse().expect("Failed to parse a variant subtag.");
///
/// let attribute2: Attribute = "testing"
/// .parse()
/// .expect("Failed to parse a variant subtag.");
/// let mut v = vec![attribute1, attribute2];
/// v.sort();
/// v.dedup();
///
/// let attributes: Attributes = Attributes::from_vec_unchecked(v);
/// assert_eq!(attributes.to_string(), "foobar-testing");
/// ```
#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct Attributes(ShortBoxSlice<Attribute>);
impl Attributes {
/// Returns a new empty set of attributes. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Attributes;
///
/// assert_eq!(Attributes::new(), Attributes::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(ShortBoxSlice::new())
}
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Attributes`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
let mut iter = SubtagIterator::new(code_units);
Self::try_from_iter(&mut iter)
}
/// A constructor which takes a pre-sorted list of [`Attribute`] elements.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{Attribute, Attributes};
///
/// let attribute1: Attribute = "foobar".parse().expect("Parsing failed.");
/// let attribute2: Attribute = "testing".parse().expect("Parsing failed.");
/// let mut v = vec![attribute1, attribute2];
/// v.sort();
/// v.dedup();
///
/// let attributes = Attributes::from_vec_unchecked(v);
/// ```
///
/// Notice: For performance- and memory-constrained environments, it is recommended
/// for the caller to use [`binary_search`](slice::binary_search) instead of [`sort`](slice::sort)
/// and [`dedup`](Vec::dedup()).
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(input: Vec<Attribute>) -> Self {
Self(input.into())
}
/// Empties the [`Attributes`] list.
///
/// Returns the old list.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{attribute, Attributes};
/// use writeable::assert_writeable_eq;
///
/// let mut attributes = Attributes::from_vec_unchecked(vec![
/// attribute!("foobar"),
/// attribute!("testing"),
/// ]);
///
/// assert_writeable_eq!(attributes, "foobar-testing");
///
/// attributes.clear();
///
/// assert_writeable_eq!(attributes, "");
/// ```
pub fn clear(&mut self) -> Self {
core::mem::take(self)
}
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
let mut attributes = ShortBoxSlice::new();
while let Some(subtag) = iter.peek() {
if let Ok(attr) = Attribute::try_from_utf8(subtag) {
if let Err(idx) = attributes.binary_search(&attr) {
attributes.insert(idx, attr);
}
} else {
break;
}
iter.next();
}
Ok(Self(attributes))
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
self.deref().iter().map(|t| t.as_str()).try_for_each(f)
}
/// Extends the `Attributes` with values from another `Attributes`.
///
/// # Example
///
/// ```
/// use icu::locale::extensions::unicode::Attributes;
///
/// let mut attrs: Attributes = "foobar-foobaz".parse().unwrap();
/// let attrs2: Attributes = "foobar-fooqux".parse().unwrap();
///
/// attrs.extend_from_attributes(attrs2);
///
/// assert_eq!(attrs, "foobar-foobaz-fooqux".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn extend_from_attributes(&mut self, other: Attributes) {
for attr in other.0 {
if let Err(idx) = self.binary_search(&attr) {
self.0.insert(idx, attr);
}
}
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Attributes {
    type Err = ParseError;

    /// Parses the string's bytes the same way [`Attributes::try_from_str`] does.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Attributes::try_from_utf8(s.as_bytes())
    }
}
// Supplies the `Writeable` support for `Attributes` that the `to_string()`
// calls in this file's examples and tests rely on (the macro is defined
// elsewhere in the crate).
// NOTE(review): the string literals look like sample attributes for tests
// generated by the macro — confirm against the macro definition.
impl_writeable_for_subtag_list!(Attributes, "foobar", "testing");
impl Deref for Attributes {
    type Target = [Attribute];

    /// Exposes the stored attributes as a slice.
    fn deref(&self) -> &[Attribute] {
        &self.0
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parsing sorts the attributes, so `foo-bar` is displayed as `bar-foo`.
    #[test]
    fn test_attributes_fromstr() {
        let parsed = "foo-bar"
            .parse::<Attributes>()
            .expect("Failed to parse Attributes");
        assert_eq!(parsed.to_string(), "bar-foo");
    }
}

View File

@@ -0,0 +1,32 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A key used in a list of [`Keywords`](super::Keywords).
///
/// The key has to be exactly two ASCII characters long, with the first
/// character being alphanumeric and the second being alphabetic.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Key;
///
/// assert!("ca".parse::<Key>().is_ok());
/// ```
Key,
extensions::unicode,
key,
extensions_unicode_key,
2..=2,
s,
s.all_bytes()[0].is_ascii_alphanumeric() && s.all_bytes()[1].is_ascii_alphabetic(),
s.to_ascii_lowercase(),
(s.all_bytes()[0].is_ascii_lowercase() || s.all_bytes()[0].is_ascii_digit())
&& s.all_bytes()[1].is_ascii_lowercase(),
InvalidExtension,
["ca", "8a"],
["a", "a8", "abc"],
);

View File

@@ -0,0 +1,453 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::borrow::Borrow;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::iter::FromIterator;
#[cfg(feature = "alloc")]
use core::str::FromStr;
use litemap::LiteMap;
use super::Key;
use super::Value;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
use crate::shortvec::ShortBoxSlice;
/// A list of [`Key`]-[`Value`] pairs representing functional information
/// about a locale's internationalization preferences.
///
/// Here are examples of fields used in Unicode:
/// - `hc` - Hour Cycle (`h11`, `h12`, `h23`, `h24`)
/// - `ca` - Calendar (`buddhist`, `gregory`, ...)
/// - `fw` - First Day Of the Week (`sun`, `mon`, `sat`, ...)
///
/// You can find the full list in [`Unicode BCP 47 U Extension`] section of LDML.
///
/// [`Unicode BCP 47 U Extension`]: https://unicode.org/reports/tr35/tr35.html#Key_And_Type_Definitions_
///
/// # Examples
///
/// Manually build up a [`Keywords`] object:
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let keywords = [(key!("hc"), value!("h23"))]
/// .into_iter()
/// .collect::<Keywords>();
///
/// assert_eq!(&keywords.to_string(), "hc-h23");
/// ```
///
/// Access a [`Keywords`] object from a [`Locale`]:
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{key, value},
/// Locale,
/// };
///
/// let loc: Locale = "und-u-hc-h23-kc-true".parse().expect("Valid BCP-47");
///
/// assert_eq!(loc.extensions.unicode.keywords.get(&key!("ca")), None);
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("hc")),
/// Some(&value!("h23"))
/// );
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("kc")),
/// Some(&value!("true"))
/// );
///
/// assert_eq!(loc.extensions.unicode.keywords.to_string(), "hc-h23-kc");
/// ```
///
/// [`Locale`]: crate::Locale
// Newtype over a `LiteMap` from `Key` to `Value` whose backing store is a
// `ShortBoxSlice` of `(Key, Value)` pairs.
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash, PartialOrd, Ord)]
pub struct Keywords(LiteMap<Key, Value, ShortBoxSlice<(Key, Value)>>);
impl Keywords {
/// Returns a new empty list of key-value pairs. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Keywords;
///
/// assert_eq!(Keywords::new(), Keywords::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(LiteMap::new())
}
/// Create a new list of key-value pairs having exactly one pair, callable in a `const` context.
#[inline]
pub const fn new_single(key: Key, value: Value) -> Self {
Self(LiteMap::from_sorted_store_unchecked(
ShortBoxSlice::new_single((key, value)),
))
}
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Keywords`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    // The subtag iterator only needs to live for this one parse.
    Self::try_from_iter(&mut SubtagIterator::new(code_units))
}
/// Returns `true` if there are no keywords.
///
/// # Examples
///
/// ```
/// use icu::locale::locale;
/// use icu::locale::Locale;
///
/// let loc1 = Locale::try_from_str("und-t-h0-hybrid").unwrap();
/// let loc2 = locale!("und-u-ca-buddhist");
///
/// assert!(loc1.extensions.unicode.keywords.is_empty());
/// assert!(!loc2.extensions.unicode.keywords.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Returns `true` if the list contains a [`Value`] for the specified [`Key`].
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let keywords = [(key!("ca"), value!("gregory"))]
/// .into_iter()
/// .collect::<Keywords>();
///
/// assert!(&keywords.contains_key(&key!("ca")));
/// ```
pub fn contains_key<Q>(&self, key: &Q) -> bool
where
    Key: Borrow<Q>,
    Q: Ord,
{
    // A key is present exactly when a lookup for it yields a value.
    self.get(key).is_some()
}
/// Returns a reference to the [`Value`] corresponding to the [`Key`].
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let keywords = [(key!("ca"), value!("buddhist"))]
/// .into_iter()
/// .collect::<Keywords>();
///
/// assert_eq!(keywords.get(&key!("ca")), Some(&value!("buddhist")));
/// ```
pub fn get<Q>(&self, key: &Q) -> Option<&Value>
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.get(key)
}
/// Returns a mutable reference to the [`Value`] corresponding to the [`Key`].
///
/// Returns `None` if the key doesn't exist or if the key has no value.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value, Keywords};
///
/// let mut keywords = [(key!("ca"), value!("buddhist"))]
/// .into_iter()
/// .collect::<Keywords>();
///
/// if let Some(value) = keywords.get_mut(&key!("ca")) {
/// *value = value!("gregory");
/// }
/// assert_eq!(keywords.get(&key!("ca")), Some(&value!("gregory")));
/// ```
#[cfg(feature = "alloc")]
pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut Value>
where
Key: Borrow<Q>,
Q: Ord,
{
self.0.get_mut(key)
}
/// Sets the specified keyword, returning the old value if it already existed.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value};
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12"
/// .parse()
/// .expect("valid BCP-47 identifier");
/// let old_value = loc
/// .extensions
/// .unicode
/// .keywords
/// .set(key!("ca"), value!("japanese"));
///
/// assert_eq!(old_value, Some(value!("buddhist")));
/// assert_eq!(loc, "und-u-hello-ca-japanese-hc-h12".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn set(&mut self, key: Key, value: Value) -> Option<Value> {
self.0.insert(key, value)
}
/// Removes the specified keyword, returning the old value if it existed.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::key;
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12"
/// .parse()
/// .expect("valid BCP-47 identifier");
/// loc.extensions.unicode.keywords.remove(key!("ca"));
/// assert_eq!(loc, "und-u-hello-hc-h12".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn remove<Q: Borrow<Key>>(&mut self, key: Q) -> Option<Value> {
    // Accepts both `Key` and `&Key`; either way the lookup uses a `&Key`.
    let target: &Key = key.borrow();
    self.0.remove(target)
}
/// Clears all Unicode extension keywords, leaving Unicode attributes.
///
/// Returns the old Unicode extension keywords.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-hello-ca-buddhist-hc-h12".parse().unwrap();
/// loc.extensions.unicode.keywords.clear();
/// assert_eq!(loc, "und-u-hello".parse().unwrap());
/// ```
pub fn clear(&mut self) -> Self {
core::mem::take(self)
}
/// Retains a subset of keywords as specified by the predicate function.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::key;
/// use icu::locale::Locale;
///
/// let mut loc: Locale = "und-u-ca-buddhist-hc-h12-ms-metric".parse().unwrap();
///
/// loc.extensions
/// .unicode
/// .keywords
/// .retain_by_key(|&k| k == key!("hc"));
/// assert_eq!(loc, "und-u-hc-h12".parse().unwrap());
///
/// loc.extensions
/// .unicode
/// .keywords
/// .retain_by_key(|&k| k == key!("ms"));
/// assert_eq!(loc, Locale::UNKNOWN);
/// ```
#[cfg(feature = "alloc")]
pub fn retain_by_key<F>(&mut self, mut predicate: F)
where
F: FnMut(&Key) -> bool,
{
self.0.retain(|k, _| predicate(k))
}
/// Compare this [`Keywords`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`Keywords`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
///
/// let bcp47_strings: &[&str] =
/// &["ca-hebrew", "ca-japanese", "ca-japanese-nu-latn", "nu-latn"];
///
/// for ab in bcp47_strings.windows(2) {
/// let a = ab[0];
/// let b = ab[1];
/// assert!(a.cmp(b) == Ordering::Less);
/// let a_kwds = format!("und-u-{}", a)
/// .parse::<Locale>()
/// .unwrap()
/// .extensions
/// .unicode
/// .keywords;
/// assert!(a_kwds.strict_cmp(a.as_bytes()) == Ordering::Equal);
/// assert!(a_kwds.strict_cmp(b.as_bytes()) == Ordering::Less);
/// }
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
    let ordering = writeable::cmp_utf8(self, other);
    ordering
}
/// Parses keywords from `iter`, consuming subtags for as long as they fit.
///
/// A two-byte subtag opens a new key; the subtags that follow are collected as
/// that key's value. Parsing stops, without an error, at the first subtag that
/// cannot continue the list, and that subtag is left unconsumed in `iter`.
/// The only error reported is a two-byte subtag that is not a valid [`Key`].
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
let mut keywords = LiteMap::new();
let mut current_keyword = None;
let mut current_value = ShortBoxSlice::new();
// Peek first: a subtag is only consumed (`iter.next()` below) once it has been accepted.
while let Some(subtag) = iter.peek() {
let slen = subtag.len();
if slen == 2 {
// A new key begins, so flush the key/value pair collected so far.
// NOTE(review): the result of `try_insert` is discarded; presumably a repeated key
// keeps its first value — confirm against the `LiteMap` documentation.
if let Some(kw) = current_keyword.take() {
keywords.try_insert(kw, Value::from_short_slice_unchecked(current_value));
current_value = ShortBoxSlice::new();
}
current_keyword = Some(Key::try_from_utf8(subtag)?);
} else if current_keyword.is_some() {
match Value::parse_subtag_from_utf8(subtag) {
Ok(Some(t)) => current_value.push(t),
// The `true` subtag parses to `None`: it is consumed but not stored.
Ok(None) => {}
// Not a value subtag: stop here and leave it in the iterator.
Err(_) => break,
}
} else {
// A non-key subtag before any key cannot belong to the keyword list.
break;
}
iter.next();
}
// Flush the last pending key, which has no following key to trigger the flush above.
if let Some(kw) = current_keyword.take() {
keywords.try_insert(kw, Value::from_short_slice_unchecked(current_value));
}
Ok(keywords.into())
}
/// Produce an ordered iterator over key-value pairs
pub fn iter(&self) -> impl Iterator<Item = (&Key, &Value)> {
self.0.iter()
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    // For every pair, emit the key and then the subtags of its value;
    // the first error from `f` ends the traversal.
    self.0.iter().try_for_each(|(key, value)| {
        f(key.as_str())?;
        value.for_each_subtag_str(&mut *f)
    })
}
/// Extends the `Keywords` with values from another `Keywords`.
///
/// # Example
///
/// ```
/// use icu::locale::extensions::unicode::Keywords;
///
/// let mut kw: Keywords = "ab-cd-ca-buddhist".parse().unwrap();
/// let kw2: Keywords = "ca-gregory-hc-h12".parse().unwrap();
///
/// kw.extend_from_keywords(kw2);
///
/// assert_eq!(kw, "ab-cd-ca-gregory-hc-h12".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn extend_from_keywords(&mut self, other: Keywords) {
    // Entries from `other` replace same-keyed entries already present (see the doc example).
    other.0.into_iter().for_each(|(key, value)| {
        self.0.insert(key, value);
    });
}
/// This needs to be its own method to help with type inference in helpers.rs
#[cfg(test)]
pub(crate) fn from_tuple_vec(v: Vec<(Key, Value)>) -> Self {
    v.into_iter().collect::<Self>()
}
}
impl From<LiteMap<Key, Value, ShortBoxSlice<(Key, Value)>>> for Keywords {
fn from(map: LiteMap<Key, Value, ShortBoxSlice<(Key, Value)>>) -> Self {
Self(map)
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromIterator<(Key, Value)> for Keywords {
fn from_iter<I: IntoIterator<Item = (Key, Value)>>(iter: I) -> Self {
LiteMap::from_iter(iter).into()
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Keywords {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
Self::try_from_str(s)
}
}
impl_writeable_for_key_value!(Keywords, "ca", "islamic-civil", "mm", "mm");
#[cfg(test)]
mod tests {
    use super::*;

    /// A single key/value pair survives a parse/serialize round trip.
    #[test]
    fn test_keywords_fromstr() {
        let parsed = "hc-h12"
            .parse::<Keywords>()
            .expect("Failed to parse Keywords");
        assert_eq!(parsed.to_string(), "hc-h12");
    }
}

View File

@@ -0,0 +1,294 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Unicode Extensions provide information about user preferences in a given locale.
//!
//! The main struct for this extension is [`Unicode`] which contains [`Keywords`] and
//! [`Attributes`].
//!
//!
//! # Examples
//!
//! ```
//! use icu::locale::extensions::unicode::{attribute, key, value, Unicode};
//! use icu::locale::Locale;
//!
//! let loc: Locale = "en-US-u-foobar-hc-h12".parse().expect("Parsing failed.");
//!
//! assert_eq!(
//! loc.extensions.unicode.keywords.get(&key!("hc")),
//! Some(&value!("h12"))
//! );
//! assert!(loc
//! .extensions
//! .unicode
//! .attributes
//! .contains(&attribute!("foobar")));
//! ```
mod attribute;
mod attributes;
mod key;
mod keywords;
mod subdivision;
mod value;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
#[doc(inline)]
pub use attribute::{attribute, Attribute};
pub use attributes::Attributes;
#[doc(inline)]
pub use key::{key, Key};
pub use keywords::Keywords;
#[doc(inline)]
pub use subdivision::{subdivision_suffix, SubdivisionId, SubdivisionSuffix};
#[doc(inline)]
pub use value::{value, Value};
#[cfg(feature = "alloc")]
use super::ExtensionType;
#[cfg(feature = "alloc")]
use crate::parser::ParseError;
#[cfg(feature = "alloc")]
use crate::parser::SubtagIterator;
pub(crate) const UNICODE_EXT_CHAR: char = 'u';
pub(crate) const UNICODE_EXT_STR: &str = "u";
/// Unicode Extensions provide information about user preferences in a given locale.
///
/// A list of [`Unicode BCP47 U Extensions`] as defined in [`Unicode Locale
/// Identifier`] specification.
///
/// Unicode extensions provide subtags that specify language and/or locale-based behavior
/// or refinements to language tags, according to work done by the Unicode Consortium.
/// (See [`RFC 6067`] for details).
///
/// [`Unicode BCP47 U Extensions`]: https://unicode.org/reports/tr35/#u_Extension
/// [`RFC 6067`]: https://www.ietf.org/rfc/rfc6067.txt
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/#Unicode_locale_identifier
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value};
/// use icu::locale::Locale;
///
/// let loc: Locale =
/// "de-u-hc-h12-ca-buddhist".parse().expect("Parsing failed.");
///
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("ca")),
/// Some(&value!("buddhist"))
/// );
/// ```
// `PartialOrd`/`Ord` are not derived here; ordering is exposed through `total_cmp`,
// which compares attributes before keywords.
#[derive(Clone, PartialEq, Eq, Debug, Default, Hash)]
#[allow(clippy::exhaustive_structs)] // spec-backed stable datastructure
pub struct Unicode {
/// The key-value pairs present in this locale extension, with each extension key subtag
/// associated to its provided value subtag.
pub keywords: Keywords,
/// A canonically ordered sequence of single standalone subtags for this locale extension.
pub attributes: Attributes,
}
impl Unicode {
/// Returns a new empty map of Unicode extensions. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Unicode;
///
/// assert_eq!(Unicode::new(), Unicode::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self {
keywords: Keywords::new(),
attributes: Attributes::new(),
}
}
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Unicode`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let mut subtags = SubtagIterator::new(code_units);
    // The first subtag must be the extension singleton.
    let Some(singleton) = subtags.next() else {
        return Err(ParseError::InvalidExtension);
    };
    match ExtensionType::try_from_byte_slice(singleton)? {
        ExtensionType::Unicode => Self::try_from_iter(&mut subtags),
        // Any other extension type is not ours to parse.
        _ => Err(ParseError::InvalidExtension),
    }
}
/// Returns [`true`] if the lists of keywords and attributes are both empty.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
///
/// assert!(!loc.extensions.unicode.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.keywords.is_empty() && self.attributes.is_empty()
}
/// Clears all Unicode extension keywords and attributes, effectively removing
/// the Unicode extension.
///
/// # Example
///
/// ```
/// use icu::locale::Locale;
///
/// let mut loc: Locale =
/// "und-t-mul-u-hello-ca-buddhist-hc-h12".parse().unwrap();
/// loc.extensions.unicode.clear();
/// assert_eq!(loc, "und-t-mul".parse().unwrap());
/// ```
pub fn clear(&mut self) {
self.keywords.clear();
self.attributes.clear();
}
pub(crate) fn as_tuple(&self) -> (&Attributes, &Keywords) {
(&self.attributes, &self.keywords)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// The ordering may or may not be equivalent to string ordering, and it
/// may or may not be stable across ICU4X releases.
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    lhs.cmp(&rhs)
}
#[cfg(feature = "alloc")]
pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
    // Attributes precede keywords in the serialized form, so parse them in that order.
    let attributes = Attributes::try_from_iter(iter)?;
    let keywords = Keywords::try_from_iter(iter)?;
    let parsed = Self {
        keywords,
        attributes,
    };
    // An extension with neither attributes nor keywords is not well-formed.
    if parsed.is_empty() {
        Err(ParseError::InvalidExtension)
    } else {
        Ok(parsed)
    }
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F, with_ext: bool) -> Result<(), E>
where
    F: FnMut(&str) -> Result<(), E>,
{
    // Nothing at all is emitted for an empty extension, not even the singleton.
    if self.is_empty() {
        return Ok(());
    }
    if with_ext {
        f(UNICODE_EXT_STR)?;
    }
    self.attributes.for_each_subtag_str(f)?;
    self.keywords.for_each_subtag_str(f)
}
/// Extends the `Unicode` with values from another `Unicode`.
///
/// # Example
///
/// ```
/// use icu::locale::extensions::unicode::Unicode;
///
/// let mut ue: Unicode = "u-foobar-ca-buddhist".parse().unwrap();
/// let ue2: Unicode = "u-ca-gregory-hc-h12".parse().unwrap();
///
/// ue.extend(ue2);
///
/// assert_eq!(ue, "u-foobar-ca-gregory-hc-h12".parse().unwrap());
/// ```
#[cfg(feature = "alloc")]
pub fn extend(&mut self, other: Unicode) {
self.keywords.extend_from_keywords(other.keywords);
self.attributes.extend_from_attributes(other.attributes);
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Unicode {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
Self::try_from_str(s)
}
}
writeable::impl_display_with_writeable!(Unicode, #[cfg(feature = "alloc")]);
impl writeable::Writeable for Unicode {
    /// Writes the extension as the `u` singleton followed by the attributes and
    /// then the keywords, each non-empty group preceded by `-`.
    ///
    /// An empty extension writes nothing.
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        // A bare `u` is not a well-formed extension (`try_from_iter` rejects it), and
        // `writeable_length_hint` reports zero bytes for the empty case, so the output
        // must be empty too or the hint would under-report the written length.
        if self.is_empty() {
            return Ok(());
        }
        sink.write_char(UNICODE_EXT_CHAR)?;
        if !self.attributes.is_empty() {
            sink.write_char('-')?;
            writeable::Writeable::write_to(&self.attributes, sink)?;
        }
        if !self.keywords.is_empty() {
            sink.write_char('-')?;
            writeable::Writeable::write_to(&self.keywords, sink)?;
        }
        Ok(())
    }

    /// Returns the length of what `write_to` produces: zero when empty, otherwise
    /// one byte for the singleton plus, for each non-empty group, its own hint and
    /// one byte for the separating `-`.
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        if self.is_empty() {
            return writeable::LengthHint::exact(0);
        }
        let mut result = writeable::LengthHint::exact(1);
        if !self.attributes.is_empty() {
            result += writeable::Writeable::writeable_length_hint(&self.attributes) + 1;
        }
        if !self.keywords.is_empty() {
            result += writeable::Writeable::writeable_length_hint(&self.keywords) + 1;
        }
        result
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_unicode_extension_fromstr() {
        let parsed = "u-foo-hc-h12"
            .parse::<Unicode>()
            .expect("Failed to parse Unicode");
        assert_eq!(parsed.to_string(), "u-foo-hc-h12");

        // A bare singleton has neither attributes nor keywords and must be rejected.
        assert!("u".parse::<Unicode>().is_err());
    }
}

View File

@@ -0,0 +1,181 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::str::FromStr;
use crate::parser::ParseError;
use crate::subtags::{Region, Subtag};
impl_tinystr_subtag!(
/// A subdivision suffix used in [`SubdivisionId`].
///
/// This suffix represents a specific subdivision code under a given [`Region`].
/// For example, the value of [`SubdivisionId`] may be `gbsct`, where the [`SubdivisionSuffix`]
/// is `sct` for Scotland.
///
/// Such a value associated with a key `rg` means that the locale should use Unit Preferences
/// (default calendar, currency, week data, time cycle, measurement system) for Scotland, even if the
/// [`LanguageIdentifier`](crate::LanguageIdentifier) is `en-US`.
///
/// A subdivision suffix has to be a sequence of alphanumerical characters no
/// shorter than one and no longer than four characters.
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{subdivision_suffix, SubdivisionSuffix};
///
/// let ss: SubdivisionSuffix =
/// "sct".parse().expect("Failed to parse a SubdivisionSuffix.");
///
/// assert_eq!(ss, subdivision_suffix!("sct"));
/// ```
SubdivisionSuffix,
extensions::unicode,
subdivision_suffix,
extensions_unicode_subdivision_suffix,
// Length range: one to four bytes, as stated in the docs above.
1..=4,
s,
// Well-formedness test: ASCII alphanumeric only.
s.is_ascii_alphanumeric(),
// NOTE(review): presumably the normalization applied to accepted input — confirm in the macro.
s.to_ascii_lowercase(),
// NOTE(review): presumably the "already normalized" test — confirm in the macro.
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
InvalidExtension,
// NOTE(review): these look like accepted and rejected sample inputs, respectively — confirm.
["sct"],
["toolooong"],
);
/// A Subdivision Id as defined in [`Unicode Locale Identifier`].
///
/// Subdivision Id is used in [`Unicode`] extensions:
/// * `rg` - Regional Override
/// * `sd` - Regional Subdivision
///
/// In both cases the subdivision is composed of a [`Region`] and a [`SubdivisionSuffix`] which represents
/// different meaning depending on the key.
///
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#unicode_subdivision_id
/// [`Unicode`]: crate::extensions::unicode::Unicode
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{subdivision_suffix, SubdivisionId},
/// subtags::region,
/// };
///
/// let ss = subdivision_suffix!("zzzz");
/// let region = region!("gb");
///
/// let si = SubdivisionId::new(region, ss);
///
/// assert_eq!(si.to_string(), "gbzzzz");
/// ```
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
#[non_exhaustive]
pub struct SubdivisionId {
/// A region field of a Subdivision Id.
///
/// It is lowercased when the id is written out (e.g. `gb` in `gbzzzz`).
pub region: Region,
/// A subdivision suffix field of a Subdivision Id.
pub suffix: SubdivisionSuffix,
}
impl SubdivisionId {
/// Returns a new [`SubdivisionId`].
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{subdivision_suffix, SubdivisionId},
/// subtags::region,
/// };
///
/// let ss = subdivision_suffix!("zzzz");
/// let region = region!("gb");
///
/// let si = SubdivisionId::new(region, ss);
///
/// assert_eq!(si.to_string(), "gbzzzz");
/// ```
pub const fn new(region: Region, suffix: SubdivisionSuffix) -> Self {
Self { region, suffix }
}
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`SubdivisionId`].
#[inline]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    // The first byte decides how long the region part is: two bytes when it
    // starts with a letter, three when it starts with a digit.
    let region_len = match code_units.first() {
        Some(first) if first.is_ascii_alphabetic() => 2,
        Some(first) if first.is_ascii_digit() => 3,
        _ => return Err(ParseError::InvalidExtension),
    };
    // Input shorter than the region part cannot be split and is rejected.
    let Some((region_bytes, suffix_bytes)) = code_units.split_at_checked(region_len) else {
        return Err(ParseError::InvalidExtension);
    };
    // Region errors are reported as extension errors, like everything else here.
    let region = match Region::try_from_utf8(region_bytes) {
        Ok(region) => region,
        Err(_) => return Err(ParseError::InvalidExtension),
    };
    let suffix = SubdivisionSuffix::try_from_utf8(suffix_bytes)?;
    Ok(Self { region, suffix })
}
/// Convert to [`Subtag`]
///
/// The region is lowercased before it is joined with the suffix, so the
/// resulting subtag reads the same as the string this type writes out
/// (for example `gbsct` rather than `GBsct`).
pub fn into_subtag(self) -> Subtag {
    // The region renders in uppercase, so it has to be lowercased explicitly here;
    // `write_to` does the same before writing.
    let region = self.region.to_tinystr().to_ascii_lowercase();
    let result = region.concat(self.suffix.to_tinystr());
    Subtag::from_tinystr_unvalidated(result)
}
}
impl writeable::Writeable for SubdivisionId {
    /// Writes the lowercased region immediately followed by the suffix.
    #[inline]
    fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
        let region = self.region.to_tinystr().to_ascii_lowercase();
        sink.write_str(region.as_str())?;
        sink.write_str(self.suffix.as_str())
    }

    /// The hint is the sum of the hints of the two parts.
    #[inline]
    fn writeable_length_hint(&self) -> writeable::LengthHint {
        let region_hint = self.region.writeable_length_hint();
        let suffix_hint = self.suffix.writeable_length_hint();
        region_hint + suffix_hint
    }
}
writeable::impl_display_with_writeable!(SubdivisionId, #[cfg(feature = "alloc")]);
impl FromStr for SubdivisionId {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
Self::try_from_str(s)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_subdivisionid_fromstr() {
        let parsed = "gbzzzz"
            .parse::<SubdivisionId>()
            .expect("Failed to parse SubdivisionId");
        // The region renders in uppercase on its own, but lowercase inside the id.
        assert_eq!(parsed.region.to_string(), "GB");
        assert_eq!(parsed.suffix.to_string(), "zzzz");
        assert_eq!(parsed.to_string(), "gbzzzz");

        // Empty input, a region without a suffix, and a truncated region all fail.
        for sample in ["", "gb", "o"] {
            assert!(
                sample.parse::<SubdivisionId>().is_err(),
                "Should fail: {sample}"
            );
        }
    }
}

View File

@@ -0,0 +1,377 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::parser::ParseError;
use crate::parser::SubtagIterator;
use crate::shortvec::{ShortBoxSlice, ShortBoxSliceIntoIter};
use crate::subtags::{subtag, Subtag};
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A value used in a list of [`Keywords`](super::Keywords).
///
/// The value has to be a sequence of one or more alphanumerical strings
/// separated by `-`.
/// Each part of the sequence has to be no shorter than three characters and no
/// longer than 8.
///
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{value, Value};
/// use writeable::assert_writeable_eq;
///
/// assert_writeable_eq!(value!("gregory"), "gregory");
/// assert_writeable_eq!(
/// "islamic-civil".parse::<Value>().unwrap(),
/// "islamic-civil"
/// );
///
/// // The value "true" has the special, empty string representation
/// assert_eq!(value!("true").to_string(), "");
/// ```
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Default)]
pub struct Value(ShortBoxSlice<Subtag>);
// The `true` subtag is represented by an empty `Value`, so the constructors and
// parsers in this file drop it instead of storing it.
const TRUE_VALUE: Subtag = subtag!("true");
impl Value {
/// A constructor which takes a str slice, parses it and
/// produces a well-formed [`Value`].
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Value;
///
/// Value::try_from_str("buddhist").expect("Parsing failed.");
/// ```
///
/// # `alloc` Cargo feature
///
/// Without the `alloc` Cargo feature, this only supports parsing
/// up to two (non-`true`) subtags, and will return an error for
/// longer strings.
#[inline]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
///
/// Empty input yields an empty [`Value`]. Subtags equal to `true` are skipped
/// rather than stored.
///
/// # Errors
///
/// Fails if any chunk produced by the subtag iterator is not a valid [`Subtag`].
/// Without the `alloc` Cargo feature it also fails with
/// [`ParseError::InvalidSubtag`] when a third non-`true` subtag is encountered.
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
let mut v = ShortBoxSlice::new();
if !code_units.is_empty() {
for chunk in SubtagIterator::new(code_units) {
let subtag = Subtag::try_from_utf8(chunk)?;
// `true` is implied by an empty value, so it is never stored.
if subtag != TRUE_VALUE {
#[cfg(feature = "alloc")]
v.push(subtag);
// Without `alloc` the store is rebuilt by hand: empty -> one element -> two
// elements; anything beyond two is rejected.
#[cfg(not(feature = "alloc"))]
if v.is_empty() {
v = ShortBoxSlice::new_single(subtag);
} else if let &[prev] = &*v {
v = ShortBoxSlice::new_double(prev, subtag);
} else {
return Err(ParseError::InvalidSubtag);
}
}
}
}
Ok(Self(v))
}
/// Returns a reference to a single [`Subtag`] if the [`Value`] contains exactly one
/// subtag, or `None` otherwise.
///
/// # Examples
///
/// ```
/// use core::str::FromStr;
/// use icu::locale::extensions::unicode::Value;
///
/// let value1 = Value::from_str("foo").expect("failed to parse a Value");
/// let value2 = Value::from_str("foo-bar").expect("failed to parse a Value");
///
/// assert!(value1.as_single_subtag().is_some());
/// assert!(value2.as_single_subtag().is_none());
/// ```
pub const fn as_single_subtag(&self) -> Option<&Subtag> {
self.0.single()
}
/// Destructs into a single [`Subtag`] if the [`Value`] contains exactly one
/// subtag, or returns `None` otherwise.
///
/// # Examples
///
/// ```
/// use core::str::FromStr;
/// use icu::locale::extensions::unicode::Value;
///
/// let value1 = Value::from_str("foo").expect("failed to parse a Value");
/// let value2 = Value::from_str("foo-bar").expect("failed to parse a Value");
///
/// assert!(value1.into_single_subtag().is_some());
/// assert!(value2.into_single_subtag().is_none());
/// ```
pub fn into_single_subtag(self) -> Option<Subtag> {
self.0.into_single()
}
#[doc(hidden)]
pub fn as_subtags_slice(&self) -> &[Subtag] {
    // Explicit deref of the backing store down to a plain slice.
    &*self.0
}
/// Appends a subtag to the back of a [`Value`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::{extensions::unicode::Value, subtags::subtag};
///
/// let mut v = Value::default();
/// v.push_subtag(subtag!("foo"));
/// v.push_subtag(subtag!("bar"));
/// assert_eq!(v, "foo-bar");
/// ```
#[cfg(feature = "alloc")]
pub fn push_subtag(&mut self, subtag: Subtag) {
self.0.push(subtag);
}
/// Returns the number of subtags in the [`Value`].
///
/// # Examples
///
/// ```
/// use icu::locale::{extensions::unicode::Value, subtags::subtag};
///
/// let mut v = Value::default();
/// assert_eq!(v.subtag_count(), 0);
/// v.push_subtag(subtag!("foo"));
/// assert_eq!(v.subtag_count(), 1);
/// ```
pub fn subtag_count(&self) -> usize {
self.0.len()
}
/// Creates an empty [`Value`], which corresponds to a "true" value.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{value, Value};
///
/// assert_eq!(value!("true"), Value::new_empty());
/// ```
pub const fn new_empty() -> Self {
Self(ShortBoxSlice::new())
}
/// Returns `true` if the Value has no subtags.
///
/// # Examples
///
/// ```
/// use icu::locale::{extensions::unicode::Value, subtags::subtag};
///
/// let mut v = Value::default();
/// assert!(v.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Removes and returns the subtag at position `index` within the value,
/// shifting all subtags after it to the left.
///
/// Returns `None`, leaving the value unchanged, if `index` is not smaller
/// than the number of subtags.
///
/// # Examples
///
/// ```
/// use icu::locale::{extensions::unicode::Value, subtags::subtag};
/// let mut v = Value::default();
/// v.push_subtag(subtag!("foo"));
/// v.push_subtag(subtag!("bar"));
/// v.push_subtag(subtag!("baz"));
///
/// assert_eq!(v.remove_subtag(1), Some(subtag!("bar")));
/// assert_eq!(v, "foo-baz");
///
/// // Two subtags remain, so index 2 is already out of range.
/// assert_eq!(v.remove_subtag(2), None);
/// assert_eq!(v, "foo-baz");
/// ```
pub fn remove_subtag(&mut self, idx: usize) -> Option<Subtag> {
    // `idx == len` is out of range too, so the guard has to be `>=`: valid
    // indices are `0..len`, and only those may reach the underlying `remove`.
    if idx >= self.0.len() {
        None
    } else {
        Some(self.0.remove(idx))
    }
}
/// Returns a reference to a subtag at index.
///
/// # Examples
///
/// ```
/// use icu::locale::{extensions::unicode::Value, subtags::subtag};
/// let mut v = Value::default();
/// v.push_subtag(subtag!("foo"));
/// v.push_subtag(subtag!("bar"));
/// v.push_subtag(subtag!("baz"));
///
/// assert_eq!(v.get_subtag(1), Some(&subtag!("bar")));
/// assert_eq!(v.get_subtag(3), None);
/// ```
pub fn get_subtag(&self, idx: usize) -> Option<&Subtag> {
self.0.get(idx)
}
#[doc(hidden)]
pub const fn from_subtag(subtag: Option<Subtag>) -> Self {
match subtag {
None | Some(TRUE_VALUE) => Self(ShortBoxSlice::new()),
Some(val) => Self(ShortBoxSlice::new_single(val)),
}
}
#[doc(hidden)]
pub fn from_two_subtags(f: Subtag, s: Subtag) -> Self {
Self(ShortBoxSlice::new_double(f, s))
}
/// A constructor which takes a list of [`Subtag`] elements and stores them
/// in the given order.
///
/// The list is wrapped as-is: it is neither sorted nor deduplicated, and the
/// `true` subtag is not filtered out.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Value;
/// use icu::locale::subtags::subtag;
///
/// let value =
///     Value::from_vec_unchecked(vec![subtag!("islamic"), subtag!("civil")]);
///
/// assert_eq!(value, "islamic-civil");
/// ```
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(input: Vec<Subtag>) -> Self {
    Value(ShortBoxSlice::from(input))
}
#[allow(dead_code)]
pub(crate) fn from_short_slice_unchecked(input: ShortBoxSlice<Subtag>) -> Self {
// Wraps the given subtag list as-is; no validation or normalization happens here.
Self(input)
}
pub(crate) const fn parse_subtag_from_utf8(t: &[u8]) -> Result<Option<Subtag>, ParseError> {
    match Subtag::try_from_utf8(t) {
        // Whatever the underlying failure was, it is reported as an invalid subtag.
        Err(_) => Err(ParseError::InvalidSubtag),
        // The `TRUE_VALUE` subtag is reported as "no subtag".
        Ok(TRUE_VALUE) => Ok(None),
        Ok(parsed) => Ok(Some(parsed)),
    }
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
self.0.iter().map(Subtag::as_str).try_for_each(f)
}
}
impl IntoIterator for Value {
type Item = Subtag;
type IntoIter = ShortBoxSliceIntoIter<Subtag>;
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromIterator<Subtag> for Value {
    fn from_iter<T: IntoIterator<Item = Subtag>>(iter: T) -> Self {
        // Collect the subtags in iteration order into the backing list.
        let subtags = ShortBoxSlice::from_iter(iter);
        Self(subtags)
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl Extend<Subtag> for Value {
    fn extend<T: IntoIterator<Item = Subtag>>(&mut self, iter: T) {
        // Append every incoming subtag, one at a time, to the end of the list.
        iter.into_iter().for_each(|subtag| {
            self.0.push(subtag);
        });
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Value {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Thin adapter so that `str::parse::<Value>()` goes through `Value::try_from_str`.
Self::try_from_str(s)
}
}
impl PartialEq<&str> for Value {
    fn eq(&self, other: &&str) -> bool {
        // Compare the written form of this value with the bytes of `other`;
        // the two are equal exactly when that comparison yields `Equal`.
        let ordering = writeable::cmp_utf8(self, other.as_bytes());
        matches!(ordering, core::cmp::Ordering::Equal)
    }
}
impl_writeable_for_subtag_list!(Value, "islamic", "civil");
/// A macro allowing for compile-time construction of a valid Unicode [`Value`].
///
/// The macro only supports single-subtag values.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::{key, value};
/// use icu::locale::Locale;
///
/// let loc: Locale = "de-u-ca-buddhist".parse().unwrap();
///
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("ca")),
/// Some(&value!("buddhist"))
/// );
/// ```
///
/// [`Value`]: crate::extensions::unicode::Value
#[macro_export]
#[doc(hidden)] // macro
macro_rules! extensions_unicode_value {
($value:literal) => {
// The whole expansion is a `const` block, so the parse below runs during
// compilation and an invalid literal surfaces as a build failure via `panic!`.
const {
$crate::extensions::unicode::Value::from_subtag(
match $crate::subtags::Subtag::try_from_utf8($value.as_bytes()) {
Ok(r) => Some(r),
_ => panic!(concat!("Invalid Unicode extension value: ", $value)),
},
)
}
};
}
#[doc(inline)]
pub use extensions_unicode_value as value;

427
vendor/icu_locale_core/src/helpers.rs vendored Normal file
View File

@@ -0,0 +1,427 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
macro_rules! impl_tinystr_subtag {
(
$(#[$doc:meta])*
$name:ident,
$($path:ident)::+,
$macro_name:ident,
$internal_macro_name:ident,
$len_start:literal..=$len_end:literal,
$tinystr_ident:ident,
$validate:expr,
$normalize:expr,
$is_normalized:expr,
$error:ident,
[$good_example:literal $(,$more_good_examples:literal)*],
[$bad_example:literal $(, $more_bad_examples:literal)*],
) => {
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
#[repr(transparent)]
$(#[$doc])*
pub struct $name(tinystr::TinyAsciiStr<$len_end>);
impl $name {
/// A constructor which takes a str slice, parses it and
#[doc = concat!("produces a well-formed [`", stringify!($name), "`].")]
///
/// # Examples
///
/// ```
#[doc = concat!("use icu_locale_core::", stringify!($($path::)+), stringify!($name), ";")]
///
#[doc = concat!("assert!(", stringify!($name), "::try_from_str(", stringify!($good_example), ").is_ok());")]
#[doc = concat!("assert!(", stringify!($name), "::try_from_str(", stringify!($bad_example), ").is_err());")]
/// ```
#[inline]
pub const fn try_from_str(s: &str) -> Result<Self, crate::parser::errors::ParseError> {
Self::try_from_utf8(s.as_bytes())
}
/// See [`Self::try_from_str`]
pub const fn try_from_utf8(
    code_units: &[u8],
) -> Result<Self, crate::parser::errors::ParseError> {
    // Reject inputs whose length falls outside the range accepted by this subtag type.
    let len = code_units.len();
    if len < $len_start || len > $len_end {
        return Err(crate::parser::errors::ParseError::$error);
    }
    match tinystr::TinyAsciiStr::try_from_utf8(code_units) {
        // `$validate` and `$normalize` are caller-supplied expressions that
        // refer to the parsed string through `$tinystr_ident`.
        Ok($tinystr_ident) if $validate => Ok(Self($normalize)),
        _ => Err(crate::parser::errors::ParseError::$error),
    }
}
#[doc = concat!("Safely creates a [`", stringify!($name), "`] from its raw format")]
/// as returned by [`Self::into_raw`]. Unlike [`Self::try_from_utf8`],
/// this constructor only takes normalized values.
pub const fn try_from_raw(
raw: [u8; $len_end],
) -> Result<Self, crate::parser::errors::ParseError> {
if let Ok($tinystr_ident) = tinystr::TinyAsciiStr::<$len_end>::try_from_raw(raw) {
if $tinystr_ident.len() >= $len_start && $is_normalized {
Ok(Self($tinystr_ident))
} else {
Err(crate::parser::errors::ParseError::$error)
}
} else {
Err(crate::parser::errors::ParseError::$error)
}
}
#[doc = concat!("Unsafely creates a [`", stringify!($name), "`] from its raw format")]
/// as returned by [`Self::into_raw`]. Unlike [`Self::try_from_utf8`],
/// this constructor only takes normalized values.
///
/// # Safety
///
/// This function is safe iff [`Self::try_from_raw`] returns an `Ok`. This is the case
/// for inputs that are correctly normalized.
pub const unsafe fn from_raw_unchecked(v: [u8; $len_end]) -> Self {
// No validation is performed here: the bytes are handed to the unchecked
// `TinyAsciiStr` constructor exactly as received.
// NOTE(review): soundness rests on the caller honoring the `# Safety` contract
// documented on this function — confirm it implies `from_utf8_unchecked`'s own
// requirements.
Self(tinystr::TinyAsciiStr::from_utf8_unchecked(v))
}
/// Deconstructs into a raw format to be consumed by
/// [`from_raw_unchecked`](Self::from_raw_unchecked()) or
/// [`try_from_raw`](Self::try_from_raw()).
pub const fn into_raw(self) -> [u8; $len_end] {
*self.0.all_bytes()
}
#[inline]
/// A helper function for displaying as a `&str`.
pub const fn as_str(&self) -> &str {
self.0.as_str()
}
#[doc(hidden)]
pub const fn to_tinystr(&self) -> tinystr::TinyAsciiStr<$len_end> {
self.0
}
/// Compare with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted
/// `self` to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`](core::cmp::Ordering::Equal)
/// is `self.as_str().as_bytes()`.
#[inline]
pub fn strict_cmp(self, other: &[u8]) -> core::cmp::Ordering {
    // Plain lexicographic byte comparison against this subtag's string form.
    let own_bytes = self.as_str().as_bytes();
    own_bytes.cmp(other)
}
/// Compare with a potentially unnormalized BCP-47 string.
///
/// The return value is equivalent to what would happen if you first parsed the
/// BCP-47 string and then performed a structural comparison.
///
#[inline]
pub fn normalizing_eq(self, other: &str) -> bool {
    // ASCII case is the only difference tolerated between the two strings.
    let own = self.as_str();
    str::eq_ignore_ascii_case(own, other)
}
}
impl core::str::FromStr for $name {
type Err = crate::parser::errors::ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
Self::try_from_str(s)
}
}
impl<'l> From<&'l $name> for &'l str {
fn from(input: &'l $name) -> Self {
input.as_str()
}
}
impl From<$name> for tinystr::TinyAsciiStr<$len_end> {
fn from(input: $name) -> Self {
input.to_tinystr()
}
}
impl writeable::Writeable for $name {
#[inline]
fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
sink.write_str(self.as_str())
}
#[inline]
fn writeable_length_hint(&self) -> writeable::LengthHint {
writeable::LengthHint::exact(self.0.len())
}
fn writeable_borrow(&self) -> Option<&str> {
Some(self.0.as_str())
}
}
writeable::impl_display_with_writeable!($name, #[cfg(feature = "alloc")]);
#[doc = concat!("A macro allowing for compile-time construction of valid [`", stringify!($name), "`] subtags.")]
///
/// # Examples
///
/// Parsing errors don't have to be handled at runtime:
/// ```
/// assert_eq!(
#[doc = concat!(" icu_locale_core::", $(stringify!($path), "::",)+ stringify!($macro_name), "!(", stringify!($good_example) ,"),")]
#[doc = concat!(" ", stringify!($good_example), ".parse::<icu_locale_core::", $(stringify!($path), "::",)+ stringify!($name), ">().unwrap()")]
/// );
/// ```
///
/// Invalid input is a compile failure:
/// ```compile_fail,E0080
#[doc = concat!("icu_locale_core::", $(stringify!($path), "::",)+ stringify!($macro_name), "!(", stringify!($bad_example) ,");")]
/// ```
///
#[doc = concat!("[`", stringify!($name), "`]: crate::", $(stringify!($path), "::",)+ stringify!($name))]
#[macro_export]
#[doc(hidden)] // macro
macro_rules! $internal_macro_name {
($string:literal) => { const {
use $crate::$($path ::)+ $name;
match $name::try_from_utf8($string.as_bytes()) {
Ok(r) => r,
_ => panic!(concat!("Invalid ", $(stringify!($path), "::",)+ stringify!($name), ": ", $string)),
}
}};
}
#[doc(inline)]
pub use $internal_macro_name as $macro_name;
#[cfg(feature = "databake")]
impl databake::Bake for $name {
fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
env.insert("icu_locale_core");
let string = self.as_str();
databake::quote! { icu_locale_core::$($path::)+ $macro_name!(#string) }
}
}
#[cfg(feature = "databake")]
impl databake::BakeSize for $name {
fn borrows_size(&self) -> usize {
0
}
}
#[test]
fn test_construction() {
let maybe = $name::try_from_utf8($good_example.as_bytes());
assert!(maybe.is_ok());
assert_eq!(maybe, $name::try_from_raw(maybe.unwrap().into_raw()));
assert_eq!(maybe.unwrap().as_str(), $good_example);
$(
let maybe = $name::try_from_utf8($more_good_examples.as_bytes());
assert!(maybe.is_ok());
assert_eq!(maybe, $name::try_from_raw(maybe.unwrap().into_raw()));
assert_eq!(maybe.unwrap().as_str(), $more_good_examples);
)*
assert!($name::try_from_utf8($bad_example.as_bytes()).is_err());
$(
assert!($name::try_from_utf8($more_bad_examples.as_bytes()).is_err());
)*
}
#[test]
fn test_writeable() {
writeable::assert_writeable_eq!(&$good_example.parse::<$name>().unwrap(), $good_example);
$(
writeable::assert_writeable_eq!($more_good_examples.parse::<$name>().unwrap(), $more_good_examples);
)*
}
#[cfg(feature = "serde")]
impl serde::Serialize for $name {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
self.0.serialize(serializer)
}
}
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for $name {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::de::Deserializer<'de>,
{
struct Visitor;
impl<'de> serde::de::Visitor<'de> for Visitor {
type Value = $name;
fn expecting(
&self,
formatter: &mut core::fmt::Formatter<'_>,
) -> core::fmt::Result {
write!(formatter, "a valid BCP-47 {}", stringify!($name))
}
fn visit_str<E: serde::de::Error>(self, s: &str) -> Result<Self::Value, E> {
s.parse().map_err(serde::de::Error::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_string(Visitor)
} else {
Self::try_from_raw(serde::de::Deserialize::deserialize(deserializer)?)
.map_err(serde::de::Error::custom)
}
}
}
// Safety checklist for ULE:
//
// 1. Must not include any uninitialized or padding bytes (true since transparent over a ULE).
// 2. Must have an alignment of 1 byte (true since transparent over a ULE).
// 3. ULE::validate_bytes() checks that the given byte slice represents a valid slice.
// 4. ULE::validate_bytes() checks that the given byte slice has a valid length.
// 5. All other methods must be left with their default impl.
// 6. Byte equality is semantic equality.
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::ULE for $name {
fn validate_bytes(bytes: &[u8]) -> Result<(), zerovec::ule::UleError> {
let it = bytes.chunks_exact(core::mem::size_of::<Self>());
if !it.remainder().is_empty() {
return Err(zerovec::ule::UleError::length::<Self>(bytes.len()));
}
for v in it {
// The following can be removed once `array_chunks` is stabilized.
let mut a = [0; core::mem::size_of::<Self>()];
a.copy_from_slice(v);
if Self::try_from_raw(a).is_err() {
return Err(zerovec::ule::UleError::parse::<Self>());
}
}
Ok(())
}
}
#[cfg(feature = "zerovec")]
impl zerovec::ule::NicheBytes<$len_end> for $name {
const NICHE_BIT_PATTERN: [u8; $len_end] = <tinystr::TinyAsciiStr<$len_end>>::NICHE_BIT_PATTERN;
}
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for $name {
type ULE = Self;
fn to_unaligned(self) -> Self::ULE {
self
}
fn from_unaligned(unaligned: Self::ULE) -> Self {
unaligned
}
}
#[cfg(feature = "zerovec")]
#[cfg(feature = "alloc")]
impl<'a> zerovec::maps::ZeroMapKV<'a> for $name {
type Container = zerovec::ZeroVec<'a, $name>;
type Slice = zerovec::ZeroSlice<$name>;
type GetType = $name;
type OwnedType = $name;
}
};
}
// Implements `writeable::Writeable` for `$type` on top of its
// `for_each_subtag_str` method, writing the subtags joined by `-`.
//
// The optional second arm (`$self, $borrow_cond => $borrow`) additionally
// generates `writeable_borrow`: when `$borrow_cond` holds, `$borrow` is returned,
// otherwise `None`. As the name says, no test is generated by this macro.
#[macro_export]
#[doc(hidden)]
macro_rules! impl_writeable_for_each_subtag_str_no_test {
($type:tt $(, $self:ident, $borrow_cond:expr => $borrow:expr)?) => {
impl writeable::Writeable for $type {
fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
// `initial` suppresses the separator in front of the first subtag.
let mut initial = true;
self.for_each_subtag_str(&mut |subtag| {
if initial {
initial = false;
} else {
sink.write_char('-')?;
}
sink.write_str(subtag)
})
}
#[inline]
fn writeable_length_hint(&self) -> writeable::LengthHint {
// Mirrors `write_to`: one unit per separator plus the length of every subtag.
let mut result = writeable::LengthHint::exact(0);
let mut initial = true;
self.for_each_subtag_str::<core::convert::Infallible, _>(&mut |subtag| {
if initial {
initial = false;
} else {
result += 1;
}
result += subtag.len();
Ok(())
})
// The error type is `Infallible`, so this can never actually fail.
.expect("infallible");
result
}
$(
fn writeable_borrow(&self) -> Option<&str> {
// Bind `self` to the caller-chosen identifier so that the caller-supplied
// `$borrow_cond` and `$borrow` expressions can refer to it.
let $self = self;
if $borrow_cond {
$borrow
} else {
None
}
}
)?
}
// NOTE(review): presumably derives the `Display` impl (and `alloc`-gated extras)
// from the `Writeable` impl above — confirm against the `writeable` crate docs.
writeable::impl_display_with_writeable!($type, #[cfg(feature = "alloc")]);
};
}
// Implements `Writeable` for a subtag-list newtype `$type` and generates a
// `test_writeable` test covering the empty, one-element (`$sample1`) and
// two-element (`$sample1`, `$sample2`) cases.
macro_rules! impl_writeable_for_subtag_list {
($type:tt, $sample1:literal, $sample2:literal) => {
// A list holding exactly one subtag can lend out that subtag's string directly.
impl_writeable_for_each_subtag_str_no_test!($type, selff, selff.0.len() == 1 => #[allow(clippy::unwrap_used)] { Some(selff.0.get(0).unwrap().as_str()) } );
#[test]
fn test_writeable() {
// Empty list writes as the empty string.
writeable::assert_writeable_eq!(&$type::default(), "");
writeable::assert_writeable_eq!(
&$type::from_vec_unchecked(alloc::vec![$sample1.parse().unwrap()]),
$sample1,
);
// Two subtags are joined with a single `-`.
writeable::assert_writeable_eq!(
&$type::from_vec_unchecked(vec![
$sample1.parse().unwrap(),
$sample2.parse().unwrap()
]),
core::concat!($sample1, "-", $sample2),
);
}
};
}
// Implements `Writeable` for a key-value collection `$type` (without a
// `writeable_borrow` override) and generates a `test_writeable` test.
macro_rules! impl_writeable_for_key_value {
($type:tt, $key1:literal, $value1:literal, $key2:literal, $expected2:literal) => {
impl_writeable_for_each_subtag_str_no_test!($type);
#[test]
fn test_writeable() {
// Empty collection writes as the empty string.
writeable::assert_writeable_eq!(&$type::default(), "");
writeable::assert_writeable_eq!(
&$type::from_tuple_vec(vec![($key1.parse().unwrap(), $value1.parse().unwrap())]),
core::concat!($key1, "-", $value1),
);
// The second pair always uses the value "true"; what it is expected to be
// written as is supplied by the caller through `$expected2`.
writeable::assert_writeable_eq!(
&$type::from_tuple_vec(vec![
($key1.parse().unwrap(), $value1.parse().unwrap()),
($key2.parse().unwrap(), "true".parse().unwrap())
]),
core::concat!($key1, "-", $value1, "-", $expected2),
);
}
};
}

681
vendor/icu_locale_core/src/langid.rs vendored Normal file
View File

@@ -0,0 +1,681 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
use crate::parser;
use crate::subtags;
use crate::ParseError;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
/// A core struct representing a [`Unicode BCP47 Language Identifier`].
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`LanguageIdentifier::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`LanguageIdentifier::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
///
/// * *well-formed* - syntactically correct
/// * *valid* - well-formed and only uses registered language, region, script and variant subtags...
/// * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment parsing normalizes a well-formed language identifier converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any syntactically invalid subtags will cause the parsing to fail with an error.
///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacement is performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// # Serde
///
/// This type implements `serde::Serialize` and `serde::Deserialize` if the
/// `"serde"` Cargo feature is enabled on the crate.
///
/// The value will be serialized as a string and parsed when deserialized.
/// For tips on efficient storage and retrieval of locales, see [`crate::zerovec`].
///
/// # Examples
///
/// Simple example:
///
/// ```
/// use icu::locale::{
/// langid,
/// subtags::{language, region},
/// };
///
/// let li = langid!("en-US");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, None);
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.len(), 0);
/// ```
///
/// More complex example:
///
/// ```
/// use icu::locale::{
/// langid,
/// subtags::{language, region, script, variant},
/// };
///
/// let li = langid!("eN-latn-Us-Valencia");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, Some(script!("Latn")));
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.first(), Some(&variant!("valencia")));
/// ```
///
/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct LanguageIdentifier {
/// Language subtag of the language identifier.
pub language: subtags::Language,
/// Script subtag of the language identifier.
pub script: Option<subtags::Script>,
/// Region subtag of the language identifier.
pub region: Option<subtags::Region>,
/// Variant subtags of the language identifier.
pub variants: subtags::Variants,
}
impl LanguageIdentifier {
/// The unknown language identifier "und".
pub const UNKNOWN: Self = crate::langid!("und");
/// A constructor which takes a utf8 slice, parses it and
/// produces a well-formed [`LanguageIdentifier`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// LanguageIdentifier::try_from_str("en-US").expect("Parsing failed");
/// ```
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    // Parsing is defined on bytes; a `&str` is handled through its byte view.
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    // Parse in language-identifier mode (as opposed to locale mode).
    let mode = parser::ParserMode::LanguageIdentifier;
    parser::parse_language_identifier(code_units, mode)
}
#[doc(hidden)] // macro use
#[expect(clippy::type_complexity)]
// The return type should be `Result<Self, ParseError>` once the `const_precise_live_drops`
// is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
pub const fn try_from_utf8_with_single_variant(
    code_units: &[u8],
) -> Result<
    (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        Option<subtags::Variant>,
    ),
    ParseError,
> {
    // Parse in language-identifier mode; the parts come back as a tuple
    // rather than as `Self` (see the note above this function).
    let mode = parser::ParserMode::LanguageIdentifier;
    parser::parse_language_identifier_with_single_variant(code_units, mode)
}
/// A constructor which takes a utf8 slice which may contain extension keys,
/// parses it and produces a well-formed [`LanguageIdentifier`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::{langid, LanguageIdentifier};
///
/// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
/// .expect("Parsing failed.");
///
/// assert_eq!(li, langid!("en-US"));
/// ```
///
/// This method should be used for input that may be a locale identifier.
/// All extensions will be lost.
#[cfg(feature = "alloc")]
pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParseError> {
    // Locale mode is what lets the input carry extensions.
    let mode = parser::ParserMode::Locale;
    parser::parse_language_identifier(v, mode)
}
/// Whether this [`LanguageIdentifier`] equals [`LanguageIdentifier::UNKNOWN`].
pub const fn is_unknown(&self) -> bool {
    // The language itself must be unknown ...
    if !self.language.is_unknown() {
        return false;
    }
    // ... and no script or region may be present ...
    if self.script.is_some() || self.region.is_some() {
        return false;
    }
    // ... nor any variants.
    self.variants.is_empty()
}
/// Normalize the language identifier (operating on UTF-8 formatted byte slices)
///
/// This operation will normalize casing and the separator.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// assert_eq!(
/// LanguageIdentifier::normalize_utf8(b"pL-latn-pl").as_deref(),
/// Ok("pl-Latn-PL")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize_utf8(input: &[u8]) -> Result<Cow<'_, str>, ParseError> {
// Parse first, so that malformed input is rejected with a `ParseError`.
let lang_id = Self::try_from_utf8(input)?;
// NOTE(review): presumably borrows from `input` when it already matches the
// normalized form and allocates otherwise — confirm against the `writeable` docs.
Ok(writeable::to_string_or_borrow(&lang_id, input))
}
/// Normalize the language identifier (operating on strings)
///
/// This operation will normalize casing and the separator.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// assert_eq!(
/// LanguageIdentifier::normalize("pL-latn-pl").as_deref(),
/// Ok("pl-Latn-PL")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize(input: &str) -> Result<Cow<'_, str>, ParseError> {
    // Same operation as `normalize_utf8`, applied to the string's bytes.
    let bytes = input.as_bytes();
    Self::normalize_utf8(bytes)
}
/// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// Sorting a list of langids with this method requires converting one of them to a string:
///
/// ```
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
/// use writeable::Writeable;
///
/// // Random input order:
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "zh-Hant-TW",
/// "zh-TW",
/// "und-fonipa",
/// "zh-Hant",
/// "ar-SA",
/// ];
///
/// let mut langids = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<LanguageIdentifier>>();
/// langids.sort_by(|a, b| {
/// let b = b.write_to_string();
/// a.strict_cmp(b.as_bytes())
/// });
/// let strict_cmp_strings = langids
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted alphabetically
/// let expected_ordering: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
///
/// assert_eq!(expected_ordering, strict_cmp_strings);
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
// Compares the written (BCP-47) form of `self` with `other`.
writeable::cmp_utf8(self, other)
}
pub(crate) fn as_tuple(
&self,
) -> (
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
&subtags::Variants,
) {
(self.language, self.script, self.region, &self.variants)
}
/// Compare this [`LanguageIdentifier`] with another [`LanguageIdentifier`] field-by-field.
/// The result is a total ordering sufficient for use in a [`BTreeSet`].
///
/// Unlike [`LanguageIdentifier::strict_cmp`], the ordering may or may not be equivalent
/// to string ordering, and it may or may not be stable across ICU4X releases.
///
/// # Examples
///
/// This method returns a nonsensical ordering derived from the fields of the struct:
///
/// ```
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
///
/// // Input strings, sorted alphabetically
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
/// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
///
/// let mut langids = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<LanguageIdentifier>>();
/// langids.sort_by(LanguageIdentifier::total_cmp);
/// let total_cmp_strings = langids
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted arbitrarily
/// let expected_ordering: &[&str] = &[
/// "ar-SA",
/// "ar-Latn",
/// "und-fonipa",
/// "zh-TW",
/// "zh-Hant",
/// "zh-Hant-TW",
/// ];
///
/// assert_eq!(expected_ordering, total_cmp_strings);
/// ```
///
/// Use a wrapper to add a [`LanguageIdentifier`] to a [`BTreeSet`]:
///
/// ```no_run
/// use icu::locale::LanguageIdentifier;
/// use std::cmp::Ordering;
/// use std::collections::BTreeSet;
///
/// #[derive(PartialEq, Eq)]
/// struct LanguageIdentifierTotalOrd(LanguageIdentifier);
///
/// impl Ord for LanguageIdentifierTotalOrd {
/// fn cmp(&self, other: &Self) -> Ordering {
/// self.0.total_cmp(&other.0)
/// }
/// }
///
/// impl PartialOrd for LanguageIdentifierTotalOrd {
/// fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
/// Some(self.cmp(other))
/// }
/// }
///
/// let _: BTreeSet<LanguageIdentifierTotalOrd> = unimplemented!();
/// ```
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    // Field-by-field comparison in tuple order: language, script, region, variants.
    let lhs = self.as_tuple();
    let rhs = other.as_tuple();
    lhs.cmp(&rhs)
}
/// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
///
/// The return value is equivalent to what would happen if you first parsed the
/// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
///
/// # Examples
///
/// ```
/// use icu::locale::LanguageIdentifier;
///
/// let bcp47_strings: &[&str] = &[
/// "pl-LaTn-pL",
/// "uNd",
/// "UnD-adlm",
/// "uNd-GB",
/// "UND-FONIPA",
/// "ZH",
/// ];
///
/// for a in bcp47_strings {
/// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
/// }
/// ```
pub fn normalizing_eq(&self, other: &str) -> bool {
    // Pulls the next raw subtag from `$iter`, parses it as `$T`, and reports whether
    // it equals `$expected`. An exhausted iterator or a parse failure is a mismatch.
    macro_rules! next_subtag_is {
        ($T:ty, $iter:ident, $expected:expr) => {
            $iter
                .next()
                .is_some_and(|bytes| <$T>::try_from_utf8(bytes) == Ok($expected))
        };
    }
    let mut remaining = parser::SubtagIterator::new(other.as_bytes());
    // The language is always present and must come first.
    if !next_subtag_is!(subtags::Language, remaining, self.language) {
        return false;
    }
    // Script and region are only consumed from `other` when `self` has them.
    if let Some(script) = self.script {
        if !next_subtag_is!(subtags::Script, remaining, script) {
            return false;
        }
    }
    if let Some(region) = self.region {
        if !next_subtag_is!(subtags::Region, remaining, region) {
            return false;
        }
    }
    // Every variant must match, in order; `all` stops at the first mismatch.
    let variants_match = self
        .variants
        .iter()
        .all(|variant| next_subtag_is!(subtags::Variant, remaining, *variant));
    // Finally, `other` must not carry any additional subtags.
    variants_match && remaining.next().is_none()
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.as_str())?;
}
if let Some(ref region) = self.region {
f(region.as_str())?;
}
for variant in self.variants.iter() {
f(variant.as_str())?;
}
Ok(())
}
/// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
/// lowercase ascii form.
///
/// The default normalization of language identifiers uses titlecase scripts and uppercase
/// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// > ordered by the separators, alphabetically._
///
/// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
/// normalization of the language identifier.
///
/// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
/// but titlecased and uppercased outside T extensions respectively.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
f(self.language.as_str())?;
if let Some(ref script) = self.script {
f(script.to_tinystr().to_ascii_lowercase().as_str())?;
}
if let Some(ref region) = self.region {
f(region.to_tinystr().to_ascii_lowercase().as_str())?;
}
for variant in self.variants.iter() {
f(variant.as_str())?;
}
Ok(())
}
/// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with
/// lowercase ascii chars.
///
/// The default normalization of language identifiers uses titlecase scripts and uppercase
/// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
///
/// > _The canonical form for all subtags in the extension is lowercase, with the fields
/// > ordered by the separators, alphabetically._
///
/// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
/// normalization of the language identifier.
///
/// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
/// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
/// but titlecased and uppercased outside T extensions respectively.
///
/// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
/// [`Transform extensions`]: crate::extensions::transform
pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
    &self,
    sink: &mut W,
) -> core::fmt::Result {
    // A `-` goes in front of every subtag except the first one.
    let mut needs_separator = false;
    self.for_each_subtag_str_lowercased(&mut |subtag| {
        if needs_separator {
            sink.write_char('-')?;
        }
        needs_separator = true;
        sink.write_str(subtag)
    })
}
}
impl core::fmt::Debug for LanguageIdentifier {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // The debug representation is the same text as the `Display` output.
        <Self as core::fmt::Display>::fmt(self, f)
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for LanguageIdentifier {
type Err = ParseError;
#[inline]
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Thin adapter so that `str::parse::<LanguageIdentifier>()` goes through
// `LanguageIdentifier::try_from_str`.
Self::try_from_str(s)
}
}
impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => Some(selff.language.as_str()));
// Round-trip checks: each identifier is parsed and must write back out as the
// same string, covering language-only up to language + script + region + variants.
#[test]
fn test_writeable() {
use writeable::assert_writeable_eq;
assert_writeable_eq!(LanguageIdentifier::UNKNOWN, "und");
// Language + region.
assert_writeable_eq!("und-001".parse::<LanguageIdentifier>().unwrap(), "und-001");
// Language + script.
assert_writeable_eq!(
"und-Mymr".parse::<LanguageIdentifier>().unwrap(),
"und-Mymr",
);
// Language + script + region.
assert_writeable_eq!(
"my-Mymr-MM".parse::<LanguageIdentifier>().unwrap(),
"my-Mymr-MM",
);
// Language + script + region + one variant.
assert_writeable_eq!(
"my-Mymr-MM-posix".parse::<LanguageIdentifier>().unwrap(),
"my-Mymr-MM-posix",
);
// Language + two variants.
assert_writeable_eq!(
"zh-macos-posix".parse::<LanguageIdentifier>().unwrap(),
"zh-macos-posix",
);
}
/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::language, LanguageIdentifier};
///
/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
/// ```
impl From<subtags::Language> for LanguageIdentifier {
    fn from(language: subtags::Language) -> Self {
        // Only the language is given; script, region and variants start out empty.
        let variants = subtags::Variants::new();
        Self {
            language,
            script: None,
            region: None,
            variants,
        }
    }
}
/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::script, LanguageIdentifier};
///
/// assert_eq!(
/// LanguageIdentifier::from(Some(script!("latn"))),
/// langid!("und-Latn")
/// );
/// ```
impl From<Option<subtags::Script>> for LanguageIdentifier {
    fn from(script: Option<subtags::Script>) -> Self {
        // Only the script is given, so the language is the unknown language.
        let language = subtags::Language::UNKNOWN;
        let variants = subtags::Variants::new();
        Self {
            language,
            script,
            region: None,
            variants,
        }
    }
}
/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::region, LanguageIdentifier};
///
/// assert_eq!(
/// LanguageIdentifier::from(Some(region!("US"))),
/// langid!("und-US")
/// );
/// ```
impl From<Option<subtags::Region>> for LanguageIdentifier {
fn from(region: Option<subtags::Region>) -> Self {
Self {
language: subtags::Language::UNKNOWN,
script: None,
region,
variants: subtags::Variants::new(),
}
}
}
/// Convert from an LSR tuple to a [`LanguageIdentifier`].
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// langid,
/// subtags::{language, region, script},
/// LanguageIdentifier,
/// };
///
/// let lang = language!("en");
/// let script = script!("Latn");
/// let region = region!("US");
/// assert_eq!(
/// LanguageIdentifier::from((lang, Some(script), Some(region))),
/// langid!("en-Latn-US")
/// );
/// ```
impl
From<(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
)> for LanguageIdentifier
{
fn from(
lsr: (
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
),
) -> Self {
Self {
language: lsr.0,
script: lsr.1,
region: lsr.2,
variants: subtags::Variants::new(),
}
}
}
/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// langid,
/// subtags::{language, region, script},
/// };
///
/// let lid = langid!("en-Latn-US");
/// let (lang, script, region) = (&lid).into();
///
/// assert_eq!(lang, language!("en"));
/// assert_eq!(script, Some(script!("Latn")));
/// assert_eq!(region, Some(region!("US")));
/// ```
impl From<&LanguageIdentifier>
for (
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
)
{
fn from(langid: &LanguageIdentifier) -> Self {
(langid.language, langid.script, langid.region)
}
}

96
vendor/icu_locale_core/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,96 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Parsing, manipulating, and serializing Unicode Language and Locale Identifiers.
//!
//! This module is published as its own crate ([`icu_locale_core`](https://docs.rs/icu_locale_core/latest/icu_locale_core/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! The module provides algorithms for parsing a string into a well-formed language or locale identifier
//! as defined by [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]. Additionally,
//! the module provides the [`preferences`] interface for operations on locale preferences and
//! conversions to and from locale Unicode extensions.
//!
//! [`Locale`] is the most common structure to use for storing information about a language,
//! script, region, variants and extensions. In almost all cases, this struct should be used as the
//! base unit for all locale management operations.
//!
//! [`LanguageIdentifier`] is a strict subset of [`Locale`] which can be useful in a narrow range of
//! cases where [`Unicode Extensions`] are not relevant.
//!
//! If in doubt, use [`Locale`].
//!
//! # Examples
//!
//! ```
//! use icu::locale::Locale;
//! use icu::locale::{
//! locale,
//! subtags::{language, region},
//! };
//!
//! let mut loc: Locale = locale!("en-US");
//!
//! assert_eq!(loc.id.language, language!("en"));
//! assert_eq!(loc.id.script, None);
//! assert_eq!(loc.id.region, Some(region!("US")));
//! assert_eq!(loc.id.variants.len(), 0);
//!
//! loc.id.region = Some(region!("GB"));
//!
//! assert_eq!(loc, locale!("en-GB"));
//! ```
//!
//! For more details, see [`Locale`] and [`LanguageIdentifier`].
//!
//! [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]: https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers
//! [`ICU4X`]: ../icu/index.html
//! [`Unicode Extensions`]: extensions
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
extern crate alloc;
#[macro_use]
mod helpers;
mod data;
mod langid;
mod locale;
mod macros;
mod parser;
mod shortvec;
pub use data::DataLocale;
pub use langid::LanguageIdentifier;
pub use locale::Locale;
pub use parser::ParseError;
pub mod extensions;
#[macro_use]
pub mod subtags;
pub mod preferences;
pub mod zerovec;
#[cfg(all(feature = "alloc", feature = "serde"))]
mod serde;
#[cfg(feature = "databake")]
mod databake;

626
vendor/icu_locale_core/src/locale.rs vendored Normal file
View File

@@ -0,0 +1,626 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::parser::*;
use crate::subtags::Subtag;
use crate::{extensions, subtags, LanguageIdentifier};
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;
/// A core struct representing a [`Unicode Locale Identifier`].
///
/// A locale is made of two parts:
/// * Unicode Language Identifier
/// * A set of Unicode Extensions
///
/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
/// on top of that is able to parse, manipulate and serialize unicode extension fields.
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`Locale::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`Locale::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for a locale:
///
/// * *well-formed* - syntactically correct
/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
/// * *canonical* - valid and no deprecated codes or structure.
///
/// Any syntactically invalid subtags will cause the parsing to fail with an error.
///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacement is performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// ICU4X's Locale parsing does not allow for non-BCP-47-compatible locales [allowed by UTS 35 for backwards compatibility][tr35-bcp].
/// Furthermore, it currently does not allow for language subtags to have more than three characters.
///
/// # Serde
///
/// This type implements `serde::Serialize` and `serde::Deserialize` if the
/// `"serde"` Cargo feature is enabled on the crate.
///
/// The value will be serialized as a string and parsed when deserialized.
/// For tips on efficient storage and retrieval of locales, see [`crate::zerovec`].
///
/// # Examples
///
/// Simple example:
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{key, value},
/// locale,
/// subtags::{language, region},
/// };
///
/// let loc = locale!("en-US-u-ca-buddhist");
///
/// assert_eq!(loc.id.language, language!("en"));
/// assert_eq!(loc.id.script, None);
/// assert_eq!(loc.id.region, Some(region!("US")));
/// assert_eq!(loc.id.variants.len(), 0);
/// assert_eq!(
/// loc.extensions.unicode.keywords.get(&key!("ca")),
/// Some(&value!("buddhist"))
/// );
/// ```
///
/// More complex example:
///
/// ```
/// use icu::locale::{subtags::*, Locale};
///
/// let loc: Locale = "eN-latn-Us-Valencia-u-hC-H12"
/// .parse()
/// .expect("Failed to parse.");
///
/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
/// assert_eq!(
/// loc.id.variants.first(),
/// "valencia".parse::<Variant>().ok().as_ref()
/// );
/// ```
///
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
/// [tr35-bcp]: https://unicode.org/reports/tr35/#BCP_47_Conformance
#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
#[allow(clippy::exhaustive_structs)] // This struct is stable (and constructed field-by-field by the `locale!` macro)
pub struct Locale {
/// The basic language/script/region components in the locale identifier along with any variants.
///
/// This is the part of the identifier that comes before any extension subtags.
pub id: LanguageIdentifier,
/// Any extensions present in the locale identifier.
///
/// A [`Locale`] converted from a bare [`LanguageIdentifier`] carries a default (empty) value here.
pub extensions: extensions::Extensions,
}
#[test]
// Expected sizes are based on a 64-bit architecture
#[cfg(target_pointer_width = "64")]
fn test_sizes() {
    use core::mem::size_of;

    // Guards against accidental growth of the core locale types; the assertions are kept
    // in their original order so the first failure reported stays the same.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 16);
    assert_eq!(size_of::<LanguageIdentifier>(), 32);

    assert_eq!(size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    assert_eq!(size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 16);
    assert_eq!(size_of::<extensions::Extensions>(), 136);

    assert_eq!(size_of::<Locale>(), 168);
}
impl Locale {
/// The unknown locale "und".
///
/// Built at compile time by the `locale!` macro from the string `"und"`, so it has no
/// script, region, variant or extension.
pub const UNKNOWN: Self = crate::locale!("und");
/// Parses a string slice and produces a well-formed [`Locale`].
///
/// This forwards the bytes of `s` to [`Self::try_from_utf8`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a [`ParseError`] when the input cannot be parsed as a locale.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// Locale::try_from_str("en-US-u-hc-h12").unwrap();
/// ```
#[inline]
#[cfg(feature = "alloc")]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
/// Parses a UTF-8 byte slice and produces a well-formed [`Locale`].
///
/// This is the byte-slice counterpart of [`Self::try_from_str`], which forwards here.
/// See [`Self::try_from_str`]
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns the [`ParseError`] reported by the locale parser when `code_units` cannot be parsed.
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
parse_locale(code_units)
}
/// Normalizes a locale given as a UTF-8 byte slice.
///
/// The input is parsed into a [`Locale`] and written back out, which normalizes casing
/// and the separator.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a [`ParseError`] when `input` cannot be parsed as a locale.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// assert_eq!(
///     Locale::normalize_utf8(b"pL-latn-pl-U-HC-H12").as_deref(),
///     Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize_utf8(input: &[u8]) -> Result<Cow<'_, str>, ParseError> {
    Self::try_from_utf8(input).map(|parsed| writeable::to_string_or_borrow(&parsed, input))
}
/// Normalizes a locale given as a string.
///
/// This forwards the bytes of `input` to [`Self::normalize_utf8`], which normalizes
/// casing and the separator.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Errors
///
/// Returns a [`ParseError`] when `input` cannot be parsed as a locale.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// assert_eq!(
///     Locale::normalize("pL-latn-pl-U-HC-H12").as_deref(),
///     Ok("pl-Latn-PL-u-hc-h12")
/// );
/// ```
#[cfg(feature = "alloc")]
pub fn normalize(input: &str) -> Result<Cow<'_, str>, ParseError> {
    let code_units = input.as_bytes();
    Self::normalize_utf8(code_units)
}
/// Compare this [`Locale`] with BCP-47 bytes.
///
/// The return value is equivalent to what would happen if you first converted this
/// [`Locale`] to a BCP-47 string and then performed a byte comparison.
///
/// This function is case-sensitive and results in a *total order*, so it is appropriate for
/// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
///
/// # Examples
///
/// Sorting a list of locales with this method requires converting one of them to a string:
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
/// use writeable::Writeable;
///
/// // Random input order:
/// let bcp47_strings: &[&str] = &[
/// "und-u-ca-hebrew",
/// "ar-Latn",
/// "zh-Hant-TW",
/// "zh-TW",
/// "und-fonipa",
/// "zh-Hant",
/// "ar-SA",
/// ];
///
/// let mut locales = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<Locale>>();
/// locales.sort_by(|a, b| {
/// let b = b.write_to_string();
/// a.strict_cmp(b.as_bytes())
/// });
/// let strict_cmp_strings = locales
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted alphabetically
/// let expected_ordering: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "und-u-ca-hebrew",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
///
/// assert_eq!(expected_ordering, strict_cmp_strings);
/// ```
pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
// Delegates to `writeable::cmp_utf8`, which compares the written (BCP-47) form of `self`
// against the raw bytes in `other`.
writeable::cmp_utf8(self, other)
}
/// Borrows this locale as a nested tuple: the tuple form of the language identifier
/// followed by the tuple form of the extensions.
///
/// [`Locale::total_cmp`] compares two locales through this representation.
#[expect(clippy::type_complexity)]
pub(crate) fn as_tuple(
    &self,
) -> (
    (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        &subtags::Variants,
    ),
    (
        (
            &extensions::unicode::Attributes,
            &extensions::unicode::Keywords,
        ),
        (
            Option<(
                subtags::Language,
                Option<subtags::Script>,
                Option<subtags::Region>,
                &subtags::Variants,
            )>,
            &extensions::transform::Fields,
        ),
        &extensions::private::Private,
        &[extensions::other::Other],
    ),
) {
    let id_part = self.id.as_tuple();
    let extensions_part = self.extensions.as_tuple();
    (id_part, extensions_part)
}
/// Returns an ordering suitable for use in [`BTreeSet`].
///
/// Unlike [`Locale::strict_cmp`], the ordering may or may not be equivalent
/// to string ordering, and it may or may not be stable across ICU4X releases.
///
/// # Examples
///
/// This method returns a nonsensical ordering derived from the fields of the struct:
///
/// ```
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
///
/// // Input strings, sorted alphabetically
/// let bcp47_strings: &[&str] = &[
/// "ar-Latn",
/// "ar-SA",
/// "und-fonipa",
/// "und-u-ca-hebrew",
/// "zh-Hant",
/// "zh-Hant-TW",
/// "zh-TW",
/// ];
/// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
///
/// let mut locales = bcp47_strings
/// .iter()
/// .map(|s| s.parse().unwrap())
/// .collect::<Vec<Locale>>();
/// locales.sort_by(Locale::total_cmp);
/// let total_cmp_strings = locales
/// .iter()
/// .map(|l| l.to_string())
/// .collect::<Vec<String>>();
///
/// // Output ordering, sorted arbitrarily
/// let expected_ordering: &[&str] = &[
/// "ar-SA",
/// "ar-Latn",
/// "und-u-ca-hebrew",
/// "und-fonipa",
/// "zh-TW",
/// "zh-Hant",
/// "zh-Hant-TW",
/// ];
///
/// assert_eq!(expected_ordering, total_cmp_strings);
/// ```
///
/// Use a wrapper to add a [`Locale`] to a [`BTreeSet`]:
///
/// ```no_run
/// use icu::locale::Locale;
/// use std::cmp::Ordering;
/// use std::collections::BTreeSet;
///
/// #[derive(PartialEq, Eq)]
/// struct LocaleTotalOrd(Locale);
///
/// impl Ord for LocaleTotalOrd {
/// fn cmp(&self, other: &Self) -> Ordering {
/// self.0.total_cmp(&other.0)
/// }
/// }
///
/// impl PartialOrd for LocaleTotalOrd {
/// fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
/// Some(self.cmp(other))
/// }
/// }
///
/// let _: BTreeSet<LocaleTotalOrd> = unimplemented!();
/// ```
///
/// [`BTreeSet`]: alloc::collections::BTreeSet
pub fn total_cmp(&self, other: &Self) -> Ordering {
    // Lexicographic comparison of the two tuple views, field by field.
    Ord::cmp(&self.as_tuple(), &other.as_tuple())
}
/// Compare this `Locale` with a potentially unnormalized BCP-47 string.
///
/// The return value is equivalent to what would happen if you first parsed the
/// BCP-47 string to a `Locale` and then performed a structural comparison.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
///
/// let bcp47_strings: &[&str] = &[
/// "pl-LaTn-pL",
/// "uNd",
/// "UND-FONIPA",
/// "UnD-t-m0-TrUe",
/// "uNd-u-CA-Japanese",
/// "ZH",
/// ];
///
/// for a in bcp47_strings {
/// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
/// }
/// ```
#[cfg(feature = "alloc")]
pub fn normalizing_eq(&self, other: &str) -> bool {
    // Pulls the next subtag from `$subtags` and reports whether it parses as a `$T`
    // equal to `$want`. Running out of subtags counts as a mismatch.
    macro_rules! next_is {
        ($T:ty, $subtags:ident, $want:expr) => {
            $subtags
                .next()
                .is_some_and(|raw| <$T>::try_from_utf8(raw) == Ok($want))
        };
    }

    let mut remaining = SubtagIterator::new(other.as_bytes());
    let id = &self.id;

    // The language is mandatory; script and region are only expected when `self` has them.
    if !next_is!(subtags::Language, remaining, id.language) {
        return false;
    }
    if let Some(script) = id.script {
        if !next_is!(subtags::Script, remaining, script) {
            return false;
        }
    }
    if let Some(region) = id.region {
        if !next_is!(subtags::Region, remaining, region) {
            return false;
        }
    }

    // Variants must appear in `other` in the same order as they are stored in `self`.
    let variants_match = id
        .variants
        .iter()
        .all(|variant| next_is!(subtags::Variant, remaining, *variant));
    if !variants_match {
        return false;
    }

    // Extensions are only parsed when `self` has some; otherwise any leftover subtag
    // makes the final emptiness check fail.
    if !self.extensions.is_empty() {
        match extensions::Extensions::try_from_iter(&mut remaining) {
            Ok(parsed) if self.extensions == parsed => {}
            _ => return false,
        }
    }

    remaining.next().is_none()
}
/// `const` parsing entry point used by the `locale!` macro.
///
/// Parses `code_units` in [`ParserMode::Locale`] and returns the pieces as a tuple:
/// language, optional script, optional region, at most one variant, and at most one
/// unicode extension keyword (its key plus an optional value subtag). The macro assembles
/// a [`Locale`] from these parts.
///
/// # Errors
///
/// Returns the [`ParseError`] produced by the underlying `const` parser.
#[doc(hidden)] // macro use
#[expect(clippy::type_complexity)]
pub const fn try_from_utf8_with_single_variant_single_keyword_unicode_extension(
code_units: &[u8],
) -> Result<
(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
Option<subtags::Variant>,
Option<(extensions::unicode::Key, Option<Subtag>)>,
),
ParseError,
> {
parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
code_units,
ParserMode::Locale,
)
}
/// Feeds the subtag strings of this locale to `f`: first those of the language
/// identifier (`id`), then those of the extensions.
///
/// # Errors
///
/// Propagates the first error returned by either of the two inner calls; the extensions
/// are not visited if the language identifier part fails.
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
self.id.for_each_subtag_str(f)?;
self.extensions.for_each_subtag_str(f)?;
Ok(())
}
}
/// Parses a string into a [`Locale`] by forwarding to [`Locale::try_from_str`], which
/// makes `str::parse` available.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Locale {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, ParseError> {
        Locale::try_from_str(s)
    }
}
impl From<LanguageIdentifier> for Locale {
fn from(id: LanguageIdentifier) -> Self {
Self {
id,
extensions: extensions::Extensions::default(),
}
}
}
impl From<Locale> for LanguageIdentifier {
fn from(loc: Locale) -> Self {
loc.id
}
}
/// Formats the locale through its `Writeable` implementation, so the `Debug` output is the
/// text produced by `write_to` rather than a field-by-field struct dump.
impl core::fmt::Debug for Locale {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
writeable::Writeable::write_to(self, f)
}
}
// NOTE(review): judging by its name, this macro (defined elsewhere in the crate) generates the
// `Writeable`-style formatting impls for `Locale` on top of `for_each_subtag_str`, without
// emitting a test — confirm against the macro definition.
// The trailing `condition => expression` arm covers locales without extensions, which defer to
// the language identifier's own `writeable_borrow`.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.writeable_borrow());
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(Locale::UNKNOWN, "und");

    // Every entry is already in normalized form, so parsing it and writing it back
    // must reproduce the input exactly.
    for expected in [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ] {
        assert_writeable_eq!(expected.parse::<Locale>().unwrap(), expected);
    }
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{locale, subtags::language};
///
/// assert_eq!(Locale::from(language!("en")), locale!("en"));
/// ```
impl From<subtags::Language> for Locale {
fn from(language: subtags::Language) -> Self {
Self {
id: language.into(),
extensions: extensions::Extensions::new(),
}
}
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{locale, subtags::script};
///
/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
/// ```
impl From<Option<subtags::Script>> for Locale {
fn from(script: Option<subtags::Script>) -> Self {
Self {
id: script.into(),
extensions: extensions::Extensions::new(),
}
}
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{locale, subtags::region};
///
/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
/// ```
impl From<Option<subtags::Region>> for Locale {
fn from(region: Option<subtags::Region>) -> Self {
Self {
id: region.into(),
extensions: extensions::Extensions::new(),
}
}
}
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::{
/// locale,
/// subtags::{language, region, script},
/// };
///
/// assert_eq!(
/// Locale::from((
/// language!("en"),
/// Some(script!("Latn")),
/// Some(region!("US"))
/// )),
/// locale!("en-Latn-US")
/// );
/// ```
impl
From<(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
)> for Locale
{
fn from(
lsr: (
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
),
) -> Self {
Self {
id: lsr.into(),
extensions: extensions::Extensions::new(),
}
}
}

185
vendor/icu_locale_core/src/macros.rs vendored Normal file
View File

@@ -0,0 +1,185 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// A macro allowing for compile-time construction of valid [`LanguageIdentifier`]s.
///
/// The macro will perform syntax normalization of the tag.
///
/// # Examples
///
/// ```
/// use icu::locale::{langid, LanguageIdentifier};
///
/// const DE_AT: LanguageIdentifier = langid!("de-at");
///
/// let de_at: LanguageIdentifier = "de-at".parse().unwrap();
///
/// assert_eq!(DE_AT, de_at);
/// ```
///
/// *Note*: The macro cannot produce language identifiers with more than one variant due to const
/// limitations (see [`Heap Allocations in Constants`]):
///
/// ```compile_fail,E0080
/// icu::locale::langid!("und-variant1-variant2");
/// ```
///
/// Use runtime parsing instead:
/// ```
/// "und-variant1-variant2"
/// .parse::<icu::locale::LanguageIdentifier>()
/// .unwrap();
/// ```
///
/// [`LanguageIdentifier`]: crate::LanguageIdentifier
/// [`Heap Allocations in Constants`]: https://github.com/rust-lang/const-eval/issues/20
#[macro_export]
macro_rules! langid {
// The whole expansion is an inline `const { … }` block, so parsing happens at compile time
// and an invalid literal surfaces as a compile-time panic.
($langid:literal) => { const {
match $crate::LanguageIdentifier::try_from_utf8_with_single_variant($langid.as_bytes()) {
Ok((language, script, region, variant)) => $crate::LanguageIdentifier {
language,
script,
region,
// The const parser yields at most one variant; wrap it, or start with an empty list.
variants: match variant {
Some(v) => $crate::subtags::Variants::from_variant(v),
None => $crate::subtags::Variants::new(),
}
},
// Any parse error ends up here, including well-formed input with more than one variant.
_ => panic!(concat!("Invalid language code: ", $langid, " . Note langid! macro can only support up to a single variant tag. Use runtime parsing instead.")),
}
}};
}
/// A macro allowing for compile-time construction of valid [`Locale`]s.
///
/// The macro will perform syntax normalization of the tag.
///
/// # Examples
///
/// ```
/// use icu::locale::{locale, Locale};
///
/// const DE_AT: Locale = locale!("de-at");
///
/// let de_at: Locale = "de-at".parse().unwrap();
///
/// assert_eq!(DE_AT, de_at);
/// ```
///
/// *Note*: The macro cannot produce locales with more than one variant or multiple extensions
/// (only single keyword unicode extension is supported) due to const
/// limitations (see [`Heap Allocations in Constants`]):
///
/// ```compile_fail,E0080
/// icu::locale::locale!("sl-IT-rozaj-biske-1994");
/// ```
/// Use runtime parsing instead:
/// ```
/// "sl-IT-rozaj-biske-1994"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
///
/// Locales with multiple keys are not supported
/// ```compile_fail,E0080
/// icu::locale::locale!("th-TH-u-ca-buddhist-nu-thai");
/// ```
/// Use runtime parsing instead:
/// ```
/// "th-TH-u-ca-buddhist-nu-thai"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
///
/// Locales with attributes are not supported
/// ```compile_fail,E0080
/// icu::locale::locale!("en-US-u-foobar-ca-buddhist");
/// ```
/// Use runtime parsing instead:
/// ```
/// "en-US-u-foobar-ca-buddhist"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
///
/// Locales with single key but multiple types are not supported
/// ```compile_fail,E0080
/// icu::locale::locale!("en-US-u-ca-islamic-umalqura");
/// ```
/// Use runtime parsing instead:
/// ```
/// "en-US-u-ca-islamic-umalqura"
/// .parse::<icu::locale::Locale>()
/// .unwrap();
/// ```
/// [`Locale`]: crate::Locale
/// [`Heap Allocations in Constants`]: https://github.com/rust-lang/const-eval/issues/20
#[macro_export]
macro_rules! locale {
// The whole expansion is an inline `const { … }` block, so parsing happens at compile time
// and an invalid literal surfaces as a compile-time panic.
($locale:literal) => { const {
match $crate::Locale::try_from_utf8_with_single_variant_single_keyword_unicode_extension(
$locale.as_bytes(),
) {
Ok((language, script, region, variant, keyword)) => $crate::Locale {
id: $crate::LanguageIdentifier {
language,
script,
region,
// The const parser yields at most one variant; wrap it, or start with an empty list.
variants: match variant {
Some(v) => $crate::subtags::Variants::from_variant(v),
None => $crate::subtags::Variants::new(),
},
},
// `keyword` is a `(key, optional value subtag)` pair; when present it becomes the
// single keyword of a unicode extension with no attributes.
extensions: match keyword {
Some(k) => $crate::extensions::Extensions::from_unicode(
$crate::extensions::unicode::Unicode {
keywords: $crate::extensions::unicode::Keywords::new_single(
k.0,
$crate::extensions::unicode::Value::from_subtag(k.1),
),
attributes: $crate::extensions::unicode::Attributes::new(),
},
),
None => $crate::extensions::Extensions::new(),
},
},
// Any parse error ends up here, including well-formed input the const parser cannot
// represent (several variants, several keywords, attributes, ...).
_ => panic!(concat!(
"Invalid language code: ",
$locale,
" . Note the locale! macro only supports up to one variant tag; \
and one unicode keyword, other extension are \
not supported. Use runtime parsing instead."
)),
}
}};
}
#[cfg(test)]
mod test {
    use crate::LanguageIdentifier;
    use crate::Locale;

    /// `langid!` accepts a single variant subtag and agrees with runtime parsing.
    #[test]
    fn test_langid_macro_can_parse_langid_with_single_variant() {
        const FROM_MACRO: LanguageIdentifier = langid!("de-at-foobar");
        let parsed = "de-at-foobar".parse::<LanguageIdentifier>().unwrap();
        assert_eq!(FROM_MACRO, parsed);
    }

    /// `locale!` accepts a single variant subtag and agrees with runtime parsing.
    #[test]
    fn test_locale_macro_can_parse_locale_with_single_variant() {
        const FROM_MACRO: Locale = locale!("de-at-foobar");
        let parsed = "de-at-foobar".parse::<Locale>().unwrap();
        assert_eq!(FROM_MACRO, parsed);
    }

    /// `locale!` accepts one unicode extension keyword and agrees with runtime parsing.
    #[test]
    fn test_locale_macro_can_parse_locale_with_single_keyword_unicode_extension() {
        const FROM_MACRO: Locale = locale!("de-at-u-ca-foobar");
        let parsed = "de-at-u-ca-foobar".parse::<Locale>().unwrap();
        assert_eq!(FROM_MACRO, parsed);
    }
}

View File

@@ -0,0 +1,69 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use displaydoc::Display;
/// List of parser errors that can be generated
/// while parsing [`LanguageIdentifier`](crate::LanguageIdentifier), [`Locale`](crate::Locale),
/// [`subtags`](crate::subtags) or [`extensions`](crate::extensions).
///
/// The [`Display`](core::fmt::Display) text of each variant is the string given in its
/// `displaydoc` attribute. The enum is `#[non_exhaustive]`, so a `match` outside this crate
/// needs a wildcard arm.
#[derive(Display, Debug, PartialEq, Copy, Clone)]
#[non_exhaustive]
pub enum ParseError {
/// Invalid language subtag.
///
/// Also returned by the language identifier parsers when the input has no subtag at all.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Language;
/// use icu::locale::ParseError;
///
/// assert_eq!("x2".parse::<Language>(), Err(ParseError::InvalidLanguage));
/// ```
#[displaydoc("The given language subtag is invalid")]
InvalidLanguage,
/// Invalid script, region or variant subtag.
///
/// The `const` parser behind the `locale!` macro also reports this for input it cannot
/// represent, such as more than one variant or more than one unicode keyword.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Region;
/// use icu::locale::ParseError;
///
/// assert_eq!("#@2X".parse::<Region>(), Err(ParseError::InvalidSubtag));
/// ```
#[displaydoc("Invalid subtag")]
InvalidSubtag,
/// Invalid extension subtag.
///
/// # Examples
///
/// ```
/// use icu::locale::extensions::unicode::Key;
/// use icu::locale::ParseError;
///
/// assert_eq!("#@2X".parse::<Key>(), Err(ParseError::InvalidExtension));
/// ```
#[displaydoc("Invalid extension")]
InvalidExtension,
/// Duplicated extension.
///
/// In the example below the `u` extension appears twice in the same locale.
///
/// # Examples
///
/// ```
/// use icu::locale::Locale;
/// use icu::locale::ParseError;
///
/// assert_eq!(
/// "und-u-hc-h12-u-ca-calendar".parse::<Locale>(),
/// Err(ParseError::DuplicatedExtension)
/// );
/// ```
#[displaydoc("Duplicated extension")]
DuplicatedExtension,
}
// Empty impl: every `Error` method keeps its default, so the only information carried is the
// `Display` text above.
impl core::error::Error for ParseError {}

View File

@@ -0,0 +1,273 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
pub use super::errors::ParseError;
use crate::extensions::unicode::{Attribute, Key, Value};
use crate::extensions::ExtensionType;
use crate::parser::SubtagIterator;
#[cfg(feature = "alloc")]
use crate::shortvec::ShortBoxSlice;
use crate::subtags::Subtag;
#[cfg(feature = "alloc")]
use crate::LanguageIdentifier;
use crate::{extensions, subtags};
/// Controls how the language identifier parsers treat subtags they cannot consume.
#[derive(PartialEq, Clone, Copy)]
pub enum ParserMode {
/// Parse the input as a bare language identifier: single-character subtags get no special
/// treatment and are handed to the regular script/region/variant parsers.
LanguageIdentifier,
/// Parse the input as a locale: the language identifier part ends at the first
/// single-character subtag, and the `const` locale parser then goes on to look for an
/// extension.
Locale,
/// Like [`ParserMode::Locale`] with respect to single-character subtags, but in addition
/// parsing stops (instead of failing) at the first subtag that is not a valid script,
/// region or variant.
#[allow(dead_code)]
Partial,
}
/// Tracks which optional component the language identifier parsers may accept next.
#[derive(PartialEq, Clone, Copy)]
enum ParserPosition {
/// Only the language has been read; the next subtag may be a script, a region or a variant.
Script,
/// A script has been read; the next subtag may be a region or a variant.
Region,
/// A region or a variant has been read; only variants may follow.
Variant,
}
/// Parses a [`LanguageIdentifier`] from the front of `iter`.
///
/// The first subtag must be a language. After it, an optional script, an optional region
/// and any number of variants are accepted, in that order. Variants are kept sorted.
///
/// `mode` decides where parsing stops:
/// * in every mode except [`ParserMode::LanguageIdentifier`], a single-character subtag ends
///   the language identifier and is left in `iter`;
/// * in [`ParserMode::Partial`], a subtag that fits none of the components also ends parsing
///   and is left in `iter` instead of producing an error.
///
/// # Errors
///
/// * [`ParseError::InvalidLanguage`] if `iter` is empty, or whatever error the language
///   subtag parser reports for the first subtag.
/// * [`ParseError::InvalidSubtag`] for a subtag that fits no component (outside `Partial`
///   mode), or for a variant repeated after an earlier variant.
#[cfg(feature = "alloc")]
pub fn parse_language_identifier_from_iter(
    iter: &mut SubtagIterator,
    mode: ParserMode,
) -> Result<LanguageIdentifier, ParseError> {
    let language = match iter.next() {
        Some(subtag) => subtags::Language::try_from_utf8(subtag)?,
        None => return Err(ParseError::InvalidLanguage),
    };

    let mut script = None;
    let mut region = None;
    let mut variants = ShortBoxSlice::new();
    let mut position = ParserPosition::Script;

    while let Some(subtag) = iter.peek() {
        // A singleton starts the extensions, which belong to the caller.
        if mode != ParserMode::LanguageIdentifier && subtag.len() == 1 {
            break;
        }

        // A script is only acceptable directly after the language.
        if position == ParserPosition::Script {
            if let Ok(parsed) = subtags::Script::try_from_utf8(subtag) {
                script = Some(parsed);
                position = ParserPosition::Region;
                iter.next();
                continue;
            }
        }

        // A region is acceptable until a region or a variant has been seen.
        if position != ParserPosition::Variant {
            if let Ok(parsed) = subtags::Region::try_from_utf8(subtag) {
                region = Some(parsed);
                position = ParserPosition::Variant;
                iter.next();
                continue;
            }
        }

        // Anything else has to be a variant.
        match subtags::Variant::try_from_utf8(subtag) {
            Ok(variant) => match variants.binary_search(&variant) {
                Err(idx) => {
                    variants.insert(idx, variant);
                }
                // A repeated variant is rejected once the parser is in the variant section.
                Ok(_) if position == ParserPosition::Variant => {
                    return Err(ParseError::InvalidSubtag);
                }
                Ok(_) => {}
            },
            Err(_) if mode == ParserMode::Partial => break,
            Err(_) => return Err(ParseError::InvalidSubtag),
        }
        position = ParserPosition::Variant;
        iter.next();
    }

    Ok(LanguageIdentifier {
        language,
        script,
        region,
        variants: subtags::Variants::from_short_slice_unchecked(variants),
    })
}
/// Parses a [`LanguageIdentifier`] from the bytes in `t`.
///
/// Splits `t` into subtags with a fresh [`SubtagIterator`] and hands them to
/// [`parse_language_identifier_from_iter`] together with `mode`.
///
/// # Errors
///
/// Returns whatever [`ParseError`] [`parse_language_identifier_from_iter`] reports.
#[cfg(feature = "alloc")]
pub fn parse_language_identifier(
    t: &[u8],
    mode: ParserMode,
) -> Result<LanguageIdentifier, ParseError> {
    parse_language_identifier_from_iter(&mut SubtagIterator::new(t), mode)
}
/// `const`-compatible locale parser limited to at most one variant and at most one
/// Unicode extension keyword.
///
/// On success returns `(language, script, region, variant, keyword)`, where `keyword` is
/// the single `-u-` key together with its optional single type subtag.
///
/// Parsing proceeds in three steps:
/// 1. The first subtag must parse as a `Language`; an exhausted iterator yields
///    `ParseError::InvalidLanguage`.
/// 2. The remaining language-identifier subtags are consumed with a small
///    Script -> Region -> Variant position state machine. In `ParserMode::Partial` an
///    unrecognized subtag stops the loop; in other modes it is `ParseError::InvalidSubtag`.
/// 3. Only in `ParserMode::Locale`, a following extension is accepted if it is the
///    Unicode (`u`) extension with no attributes, at most one key, and at most one type
///    subtag. Every other extension kind is rejected.
///
/// Errors are propagated with explicit `match` / `return Err(e)` instead of `?`, and the
/// iterator is advanced by value through `next_const`, so the function stays `const`.
#[expect(clippy::type_complexity)]
pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(
mut iter: SubtagIterator,
mode: ParserMode,
) -> Result<
(
subtags::Language,
Option<subtags::Script>,
Option<subtags::Region>,
Option<subtags::Variant>,
Option<(extensions::unicode::Key, Option<Subtag>)>,
),
ParseError,
> {
let language;
let mut script = None;
let mut region = None;
let mut variant = None;
let mut keyword = None;
// Step 1: the mandatory language subtag.
if let (i, Some(subtag)) = iter.next_const() {
iter = i;
match subtags::Language::try_from_utf8(subtag) {
Ok(l) => language = l,
Err(e) => return Err(e),
}
} else {
return Err(ParseError::InvalidLanguage);
}
// Step 2: optional script, region and variant, tracked by `position`.
let mut position = ParserPosition::Script;
while let Some(subtag) = iter.peek() {
// Outside of pure language-identifier parsing, a single-character subtag ends the
// language-identifier portion (it is handled by the extension step below).
if !matches!(mode, ParserMode::LanguageIdentifier) && subtag.len() == 1 {
break;
}
if matches!(position, ParserPosition::Script) {
// Nothing but the language has been seen yet: script, region or variant may follow.
if let Ok(s) = subtags::Script::try_from_utf8(subtag) {
script = Some(s);
position = ParserPosition::Region;
} else if let Ok(r) = subtags::Region::try_from_utf8(subtag) {
region = Some(r);
position = ParserPosition::Variant;
} else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
// We cannot handle multiple variants in a const context
debug_assert!(variant.is_none());
variant = Some(v);
position = ParserPosition::Variant;
} else if matches!(mode, ParserMode::Partial) {
break;
} else {
return Err(ParseError::InvalidSubtag);
}
} else if matches!(position, ParserPosition::Region) {
// A script has been consumed: only a region or a variant may follow.
if let Ok(s) = subtags::Region::try_from_utf8(subtag) {
region = Some(s);
position = ParserPosition::Variant;
} else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
// We cannot handle multiple variants in a const context
debug_assert!(variant.is_none());
variant = Some(v);
position = ParserPosition::Variant;
} else if matches!(mode, ParserMode::Partial) {
break;
} else {
return Err(ParseError::InvalidSubtag);
}
} else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
// Variant position: a second variant is an error rather than being collected.
debug_assert!(matches!(position, ParserPosition::Variant));
if variant.is_some() {
// We cannot handle multiple variants in a const context
return Err(ParseError::InvalidSubtag);
}
variant = Some(v);
} else if matches!(mode, ParserMode::Partial) {
break;
} else {
return Err(ParseError::InvalidSubtag);
}
// The peeked subtag was accepted above; consume it.
iter = iter.next_const().0;
}
// Step 3: an optional, restricted Unicode extension (Locale mode only).
if matches!(mode, ParserMode::Locale) {
if let Some(subtag) = iter.peek() {
match ExtensionType::try_from_utf8(subtag) {
Ok(ExtensionType::Unicode) => {
// Consume the extension singleton itself.
iter = iter.next_const().0;
if let Some(peek) = iter.peek() {
if Attribute::try_from_utf8(peek).is_ok() {
// We cannot handle Attributes in a const context
return Err(ParseError::InvalidSubtag);
}
}
let mut key = None;
let mut current_type = None;
while let Some(peek) = iter.peek() {
// Two-byte subtags are treated as keys; anything else after a key is a type subtag.
if peek.len() == 2 {
if key.is_some() {
// We cannot handle more than one Key in a const context
return Err(ParseError::InvalidSubtag);
}
match Key::try_from_utf8(peek) {
Ok(k) => key = Some(k),
Err(e) => return Err(e),
};
} else if key.is_some() {
match Value::parse_subtag_from_utf8(peek) {
Ok(Some(t)) => {
if current_type.is_some() {
// We cannot handle more than one type in a const context
return Err(ParseError::InvalidSubtag);
}
current_type = Some(t);
}
// `Ok(None)` means the subtag was accepted but contributes no stored value.
Ok(None) => {}
Err(e) => return Err(e),
}
} else {
break;
}
iter = iter.next_const().0;
}
if let Some(k) = key {
keyword = Some((k, current_type));
}
}
// We cannot handle Transform, Private, Other extensions in a const context
Ok(_) => return Err(ParseError::InvalidSubtag),
Err(e) => return Err(e),
}
}
}
Ok((language, script, region, variant, keyword))
}
/// `const`-compatible parser for a language identifier carrying at most one variant.
///
/// Delegates to
/// `parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter` and
/// discards the Unicode keyword slot of its result, returning
/// `(language, script, region, variant)`. Errors are passed through unchanged.
#[expect(clippy::type_complexity)]
pub const fn parse_language_identifier_with_single_variant(
    t: &[u8],
    mode: ParserMode,
) -> Result<
    (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        Option<subtags::Variant>,
    ),
    ParseError,
> {
    let parsed = parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(
        SubtagIterator::new(t),
        mode,
    );
    // `?` is not available in a `const fn`, so unwrap the result by hand.
    let (language, script, region, variant, _keyword) = match parsed {
        Ok(parts) => parts,
        Err(e) => return Err(e),
    };
    Ok((language, script, region, variant))
}

View File

@@ -0,0 +1,42 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions;
use crate::parser::{ParseError, ParserMode, SubtagIterator};
use crate::subtags::{self, Subtag};
#[cfg(feature = "alloc")]
use crate::Locale;
use super::parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter;
#[cfg(feature = "alloc")]
/// Parses the byte string `t` into a full `Locale`.
///
/// The language-identifier part is parsed first in `ParserMode::Locale`; if any subtag is
/// left afterwards it is handed to `Extensions::try_from_iter`, otherwise the locale gets
/// default (empty) extensions.
pub fn parse_locale(t: &[u8]) -> Result<Locale, ParseError> {
    let mut subtags = SubtagIterator::new(t);
    let id = super::parse_language_identifier_from_iter(&mut subtags, ParserMode::Locale)?;
    let extensions = match subtags.peek() {
        Some(_) => extensions::Extensions::try_from_iter(&mut subtags)?,
        None => extensions::Extensions::default(),
    };
    Ok(Locale { id, extensions })
}
/// `const`-compatible locale parser limited to one variant and one Unicode keyword.
///
/// Wraps `t` in a `SubtagIterator` and forwards to
/// `parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter`,
/// returning its result unchanged.
#[expect(clippy::type_complexity)]
pub const fn parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
    t: &[u8],
    mode: ParserMode,
) -> Result<
    (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        Option<subtags::Variant>,
        Option<(extensions::unicode::Key, Option<Subtag>)>,
    ),
    ParseError,
> {
    parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(
        SubtagIterator::new(t),
        mode,
    )
}

185
vendor/icu_locale_core/src/parser/mod.rs vendored Normal file
View File

@@ -0,0 +1,185 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
pub mod errors;
mod langid;
mod locale;
pub use errors::ParseError;
pub use langid::*;
pub use locale::*;
// Returns the leading part of `slice` up to, but not including, the first `-`
// (or all of `slice` if it contains no `-`).
//
// Safety-usable invariant: returns a prefix of `slice`
const fn skip_before_separator(slice: &[u8]) -> &[u8] {
    let len = slice.len();
    let mut cut = 0;
    // Invariant: cut ≤ len, since cut starts at 0 and is only incremented while cut < len.
    #[expect(clippy::indexing_slicing)] // very protected, should optimize out
    while cut < len {
        // Here cut < len, so the index is in bounds.
        if slice[cut] == b'-' {
            break;
        }
        cut += 1;
    }
    // Notice: the result may be empty for cases like `"en-"` or `"en--US"`
    // SAFETY: cut ≤ slice.len() by the loop invariant above
    // Safety-usable invariant upheld: returned a prefix of the slice
    unsafe { slice.split_at_unchecked(cut).0 }
}
// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// Its shape is unusual because of the focus on performance and the limitations Rust places
// on `const` functions.
//
// The iterator is eager: the upcoming subtag is computed as soon as the iterator is created
// or advanced, which makes `peek` free. It splits on `-` only; malformed inputs such as
// `"-"`, `"-en"` or `"en-"` produce empty subtags, which callers can then reject.
//
// For `const` callers it offers `next_const` (which takes and returns the iterator by value)
// and `peek`; for everyone else it also implements the standard `Iterator` trait.
//
// `peek` and `next` return an `Option` of a byte slice; `next_const` returns the advanced
// iterator together with that `Option`.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
// Input that has not been yielded yet, starting at the upcoming subtag.
remaining: &'a [u8],
// The upcoming subtag, or `None` once the input is exhausted.
// Safety invariant: current is a prefix of remaining
current: Option<&'a [u8]>,
}
impl<'a> SubtagIterator<'a> {
    /// Creates an iterator over the `-`-separated subtags of `rest`.
    ///
    /// The first subtag is computed immediately, so `peek` works right away.
    pub const fn new(rest: &'a [u8]) -> Self {
        // Safety invariant upheld: skip_before_separator() returns a prefix of `rest`
        let first = skip_before_separator(rest);
        Self {
            remaining: rest,
            current: Some(first),
        }
    }

    /// `const`-friendly version of `Iterator::next`.
    ///
    /// Consumes the iterator and returns the advanced iterator together with the subtag
    /// that was pending, or `None` if the input was already exhausted.
    pub const fn next_const(mut self) -> (Self, Option<&'a [u8]>) {
        let yielded = match self.current {
            Some(subtag) => subtag,
            None => return (self, None),
        };
        if yielded.len() < self.remaining.len() {
            // There is more input after `yielded`; by construction it starts with a separator,
            // which is skipped together with `yielded`.
            // SAFETY: `self.remaining` is strictly longer than `yielded` due to `yielded` being a
            // prefix (from the safety invariant), so `yielded.len() + 1 <= self.remaining.len()`
            self.remaining = unsafe { self.remaining.split_at_unchecked(yielded.len() + 1).1 };
            // Safety invariant upheld: skip_before_separator() returns a prefix of its argument,
            // and self.remaining is not mutated after this
            self.current = Some(skip_before_separator(self.remaining));
        } else {
            // `yielded` was the whole remaining input: nothing is left to yield.
            self.current = None;
        }
        (self, Some(yielded))
    }

    /// Returns the subtag that the next call to `next`/`next_const` will yield, without
    /// advancing the iterator.
    pub const fn peek(&self) -> Option<&'a [u8]> {
        self.current
    }
}
/// Standard iterator interface, implemented on top of `next_const`.
impl<'a> Iterator for SubtagIterator<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        // `SubtagIterator` is `Copy`: advance a copy by value and store it back.
        let (advanced, subtag) = (*self).next_const();
        *self = advanced;
        subtag
    }
}
// Unit tests for `SubtagIterator` and `skip_before_separator`.
#[cfg(test)]
mod test {
use super::*;
// Test helper: view a yielded subtag as `&str` so assertions read naturally.
fn slice_to_str(input: &[u8]) -> &str {
std::str::from_utf8(input).unwrap()
}
// `peek` must be repeatable and must not advance the iterator.
#[test]
fn subtag_iterator_peek_test() {
let slice = "de-at-u-ca-foobar";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.peek().map(slice_to_str), Some("de"));
assert_eq!(si.peek().map(slice_to_str), Some("de"));
assert_eq!(si.next().map(slice_to_str), Some("de"));
assert_eq!(si.peek().map(slice_to_str), Some("at"));
assert_eq!(si.peek().map(slice_to_str), Some("at"));
assert_eq!(si.next().map(slice_to_str), Some("at"));
}
// Iteration over well-formed and malformed inputs; leading, trailing and doubled
// separators surface as empty subtags.
#[test]
fn subtag_iterator_test() {
// Empty input yields a single empty subtag.
let slice = "";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
let slice = "-";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
let slice = "-en";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some("en"));
assert_eq!(si.next(), None);
let slice = "en";
let si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]);
let slice = "en-";
let si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]);
// Two separators delimit three (empty) subtags.
let slice = "--";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next(), None);
let slice = "-en-";
let mut si = SubtagIterator::new(slice.as_bytes());
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next().map(slice_to_str), Some("en"));
assert_eq!(si.next().map(slice_to_str), Some(""));
assert_eq!(si.next(), None);
let slice = "de-at-u-ca-foobar";
let si = SubtagIterator::new(slice.as_bytes());
assert_eq!(
si.map(slice_to_str).collect::<Vec<_>>(),
vec!["de", "at", "u", "ca", "foobar",]
);
}
// `skip_before_separator` returns everything before the first `-`.
#[test]
fn skip_before_separator_test() {
let current = skip_before_separator(b"");
assert_eq!(current, b"");
let current = skip_before_separator(b"en");
assert_eq!(current, b"en");
let current = skip_before_separator(b"en-");
assert_eq!(current, b"en");
let current = skip_before_separator(b"en--US");
assert_eq!(current, b"en");
let current = skip_before_separator(b"-US");
assert_eq!(current, b"");
let current = skip_before_separator(b"US");
assert_eq!(current, b"US");
let current = skip_before_separator(b"-");
assert_eq!(current, b"");
}
}

View File

@@ -0,0 +1,23 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A set of extensions which correspond to preferences.
//!
//! The module provides structures that represent known values for each keyword
//! in Locale [`extensions`](crate::extensions) with semantic meaning.
//!
//! # Syntactic vs Semantic Extension Handling
//!
//! This module ensures that only valid, recognized values are used, providing semantic validation.
//! It would reject invalid values such as `-u-hc-BB` because `BB` is not a known hour cycle. This
//! is ideal for applications that require strict adherence to standardized values and need to
//! prevent invalid or unrecognized data.
//!
//! If you need to construct syntactically valid Locale extensions without semantic validation,
//! allowing any valid key-value pair regardless of recognition, consider using the
//! [`crate::extensions`] module.
//!
//! [`Locale`]: crate::Locale
pub mod unicode;

View File

@@ -0,0 +1,15 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Errors related to parsing of Preferences.
/// Error returned by parsers of unicode extensions as preferences.
///
/// The enum is `#[non_exhaustive]`, so downstream `match`es need a wildcard arm.
#[non_exhaustive]
#[derive(Debug, displaydoc::Display)]
pub enum PreferencesParseError {
// NOTE(review): `displaydoc::Display` presumably derives the `Display` text from the
// variant's doc comment below, so treat that sentence as user-facing output.
/// The given keyword value is not a valid preference variant.
InvalidKeywordValue,
}
// No overrides: the default `Error` methods are used.
impl core::error::Error for PreferencesParseError {}

View File

@@ -0,0 +1,66 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#![allow(non_snake_case)]
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// Hijri Calendar sub-type
///
/// The list is based on [`CLDR Calendars`](https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml)
// No string mappings or extension key are given here; in this file the variants are
// only referenced from the nested `"islamic"` entry of `CalendarAlgorithm`.
HijriCalendarAlgorithm {
/// Hijri calendar, Umm al-Qura
Umalqura,
/// Hijri calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - astronomical epoch)
Tbla,
/// Hijri calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - civil epoch)
Civil,
/// Hijri calendar, Saudi Arabia sighting
Rgsa
});
enum_keyword!(
/// A Unicode Calendar Identifier defines a type of calendar.
///
/// This selects calendar-specific data within a locale used for formatting and parsing,
/// such as date/time symbols and patterns; it also selects supplemental calendarData used
/// for calendrical calculations. The value can affect the computation of the first day of the week.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCalendarIdentifier).
// Each entry maps an extension value subtag (left) to an enum variant (right).
CalendarAlgorithm {
/// Thai Buddhist calendar (same as Gregorian except for the year)
("buddhist" => Buddhist),
/// Traditional Chinese calendar
("chinese" => Chinese),
/// Coptic calendar
("coptic" => Coptic),
/// Traditional Korean calendar
("dangi" => Dangi),
/// Ethiopic calendar, Amete Alem (epoch approx. 5493 B.C.E)
("ethioaa" => Ethioaa),
/// Ethiopic calendar, Amete Mihret (epoch approx. 8 C.E.)
("ethiopic" => Ethiopic),
/// Gregorian calendar
("gregory" => Gregory),
/// Traditional Hebrew calendar
("hebrew" => Hebrew),
/// Indian calendar
("indian" => Indian),
/// Hijri calendar
// `Hijri` carries an optional `HijriCalendarAlgorithm` sub-type; the nested entries
// presumably map a second value subtag (e.g. `islamic-umalqura`) - TODO confirm in macro.
("islamic" => Hijri(HijriCalendarAlgorithm) {
("umalqura" => Umalqura),
("tbla" => Tbla),
("civil" => Civil),
("rgsa" => Rgsa)
}),
/// ISO calendar (Gregorian calendar using the ISO 8601 calendar week rules)
("iso8601" => Iso8601),
/// Japanese Imperial calendar
("japanese" => Japanese),
/// Persian calendar
("persian" => Persian),
/// Republic of China calendar
("roc" => Roc)
// Trailing arguments: the extension key ("ca"), then a custom hook that maps the
// single-subtag value `islamicc` to `Hijri(Some(Civil))` before the regular table is used.
}, "ca", s, if *s == value!("islamicc") { return Ok(Self::Hijri(Some(HijriCalendarAlgorithm::Civil))); });

View File

@@ -0,0 +1,75 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Collation Identifier defines a type of collation (sort order).
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCollationIdentifier).
// Each entry maps an extension value subtag (left) to an enum variant (right);
// the trailing "co" is the extension key this preference is read from.
CollationType {
/// A previous version of the ordering, for compatibility
("compat" => Compat),
/// Dictionary style ordering (such as in Sinhala)
("dict" => Dict),
/// The default Unicode collation element table order
("ducet" => Ducet),
/// Recommended ordering for emoji characters
("emoji" => Emoji),
/// European ordering rules
("eor" => Eor),
/// Phonebook style ordering (such as in German)
("phonebk" => Phonebk),
/// Phonetic ordering (sorting based on pronunciation)
("phonetic" => Phonetic),
/// Pinyin ordering for Latin and for CJK characters (used in Chinese)
("pinyin" => Pinyin),
/// Special collation type for string search
("search" => Search),
/// Special collation type for Korean initial consonant search
("searchjl" => Searchjl),
/// Default ordering for each language
("standard" => Standard),
/// Pinyin ordering for Latin, stroke order for CJK characters (used in Chinese)
("stroke" => Stroke),
/// Traditional style ordering (such as in Spanish)
("trad" => Trad),
/// Pinyin ordering for Latin, Unihan radical-stroke ordering for CJK characters (used in Chinese)
("unihan" => Unihan),
/// Pinyin ordering for Latin, zhuyin order for Bopomofo and CJK characters (used in Chinese)
("zhuyin" => Zhuyin),
}, "co");
enum_keyword!(
/// Collation parameter key for ordering by case.
///
/// If set to upper, causes upper case to sort before lower case. If set to lower, causes lower case to sort before upper case.
/// Useful for locales that have already supported ordering but require different order of cases. Affects case and tertiary levels.
///
/// For details, see [LDML](https://unicode.org/reports/tr35/tr35-collation.html#Case_Parameters).
// NOTE(review): `[Default]` / `[default]` presumably make the macro implement `Default`
// with the marked variant (`False`) as the default value - confirm against the macro.
[Default]
CollationCaseFirst {
/// Upper case to be sorted before lower case
("upper" => Upper),
/// Lower case to be sorted before upper case
("lower" => Lower),
/// No special case ordering
[default]
("false" => False),
}, "kf");
enum_keyword!(
/// Collation parameter key for numeric handling.
///
/// If set to on, any sequence of Decimal Digits (General_Category = Nd in the UAX44) is sorted at a primary level with
/// its numeric value. For example, "1" < "2" < "10". The computed primary weights are all at the start of the digit
/// reordering group.
// The accepted value subtags are "true" and "false"; "kn" is the extension key.
// NOTE(review): `[Default]` / `[default]` presumably select `False` as the `Default` value.
[Default]
CollationNumericOrdering {
/// A sequence of decimal digits is sorted at primary level with its numeric value
("true" => True),
/// No special handling for numeric ordering
[default]
("false" => False),
}, "kn");

View File

@@ -0,0 +1,31 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{extensions::unicode::Value, subtags::Subtag};
use tinystr::TinyAsciiStr;
struct_keyword!(
/// A Unicode Currency Identifier defines a type of currency.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCurrencyIdentifier).
CurrencyType,
"cu",
TinyAsciiStr<3>,
// Parsing: accept only a value made of a single subtag that is exactly three ASCII
// letters; anything else is `InvalidKeywordValue`.
|input: Value| {
if let Some(subtag) = input.into_single_subtag() {
let ts = subtag.as_tinystr();
if ts.len() == 3 && ts.is_ascii_alphabetic() {
// Length was checked above, so narrowing to 3 bytes loses nothing.
return Ok(Self(ts.resize()));
}
}
Err(PreferencesParseError::InvalidKeywordValue)
},
// Serialization: widen the 3-byte code back to subtag width and wrap it without
// re-validation (the value was validated when the `CurrencyType` was parsed).
|input: CurrencyType| {
crate::extensions::unicode::Value::from_subtag(Some(
Subtag::from_tinystr_unvalidated(input.0.resize()),
))
}
);

View File

@@ -0,0 +1,18 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Currency Format Identifier defines a style for currency formatting.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCurrencyFormatIdentifier).
// The accepted value subtags are "standard" and "account"; "cf" is the extension key.
// NOTE(review): `[Default]` / `[default]` presumably select `Standard` as the `Default` value.
[Default]
CurrencyFormatStyle {
/// Negative numbers use the minusSign symbol (the default)
[default]
("standard" => Standard),
/// Negative numbers use parentheses or equivalent
("account" => Account)
}, "cf");

View File

@@ -0,0 +1,34 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions::unicode::Value;
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::subtags::Script;
use alloc::vec::Vec;
use core::str::FromStr;
struct_keyword!(
/// A Unicode Dictionary Break Exclusion Identifier specifies
/// scripts to be excluded from dictionary-based text break (for words and lines).
///
/// The valid values are of one or more items of type [`Script`](crate::subtags::Script).
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
DictionaryBreakScriptExclusions,
"dx",
Vec<Script>,
// Parsing: every subtag of the value must parse as a `Script`; the first failure
// aborts the whole conversion with `InvalidKeywordValue`.
|input: Value| {
input
.into_iter()
.map(|s| {
Script::from_str(s.as_str()).map_err(|_| PreferencesParseError::InvalidKeywordValue)
})
.collect::<Result<_, _>>()
.map(Self)
},
// Serialization: convert each stored `Script` with `Into` and collect the results
// into the extension value.
|input: DictionaryBreakScriptExclusions| {
input.0.into_iter().map(Into::into).collect()
}
);

View File

@@ -0,0 +1,23 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Emoji Presentation Style Identifier
///
/// It specifies a request for the preferred emoji
/// presentation style. This can be used as part of the value for an HTML lang attribute,
/// for example `<html lang="sr-Latn-u-em-emoji">`.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeEmojiPresentationStyleIdentifier).
// The accepted value subtags are "emoji", "text" and "default"; "em" is the extension key.
// NOTE(review): `[Default]` / `[default]` presumably select the `Default` variant as the
// `Default` value.
[Default]
EmojiPresentationStyle {
/// Use an emoji presentation for emoji characters if possible
("emoji" => Emoji),
/// Use a text presentation for emoji characters if possible
("text" => Text),
/// Use the default presentation for emoji characters as specified in UTR #51 Presentation Style
[default]
("default" => Default)
}, "em");

View File

@@ -0,0 +1,29 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode First Day Identifier defines the preferred first day of the week for calendar display.
///
/// Specifying "fw" in a locale identifier overrides the default value specified by
/// supplemental week data for the region.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeFirstDayIdentifier).
// Each entry maps a three-letter day subtag (left) to an enum variant (right).
// No `[Default]` marker is given for this keyword.
FirstDay {
/// Sunday
("sun" => Sun),
/// Monday
("mon" => Mon),
/// Tuesday
("tue" => Tue),
/// Wednesday
("wed" => Wed),
/// Thursday
("thu" => Thu),
/// Friday
("fri" => Fri),
/// Saturday
("sat" => Sat)
}, "fw");

View File

@@ -0,0 +1,18 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Hour Cycle Identifier defines the preferred time cycle. Specifying "hc" in a locale identifier overrides the default value specified by supplemental time data for the region.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeHourCycleIdentifier).
HourCycle {
/// The typical 12-hour clock. Hours are numbered 1-12. Corresponds to 'h' in patterns.
("h12" => H12),
/// The 24-hour clock. Hours are numbered 0-23. Corresponds to 'H' in patterns.
("h23" => H23),
/// Variant of the 12-hour clock, sometimes used in Japan. Hours are numbered 0-11. Corresponds to 'K' in patterns.
("h11" => H11),
}, "hc");

View File

@@ -0,0 +1,21 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Line Break Style Identifier defines a preferred line break style corresponding to the CSS level 3 line-break option.
///
/// Specifying "lb" in a locale identifier overrides the locale's default style
/// (which may correspond to "normal" or "strict").
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeLineBreakStyleIdentifier).
LineBreakStyle {
/// CSS level 3 line-break=strict, e.g. treat CJ as NS
("strict" => Strict),
/// CSS level 3 line-break=normal, e.g. treat CJ as ID, break before hyphens for ja,zh
("normal" => Normal),
/// CSS level 3 line-break=loose
("loose" => Loose),
}, "lb");

View File

@@ -0,0 +1,23 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Line Break Word Identifier defines preferred line break word handling behavior corresponding to the CSS level 3 word-break option.
///
/// Specifying "lw" in a locale identifier overrides the locale's default style (which may correspond to "normal" or "keepall").
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeLineBreakWordIdentifier).
LineBreakWordHandling {
/// CSS level 3 word-break=normal, normal script/language behavior for midword breaks
("normal" => Normal),
/// CSS level 3 word-break=break-all, allow midword breaks unless forbidden by lb setting
("breakall" => BreakAll),
/// CSS level 3 word-break=keep-all, prohibit midword breaks except for dictionary breaks
("keepall" => KeepAll),
/// Prioritize keeping natural phrases (of multiple words) together when breaking,
/// used in short text like title and headline
("phrase" => Phrase),
}, "lw");

View File

@@ -0,0 +1,20 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Unicode Measurement System Identifier defines a preferred measurement system.
///
/// Specifying "ms" in a locale identifier overrides the default value specified by supplemental measurement system data for the region.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeMeasurementSystemIdentifier).
// Each entry maps an extension value subtag (left) to an enum variant (right).
MeasurementSystem {
/// Metric System
("metric" => Metric),
/// US System of measurement: feet, pints, etc.; pints are 16oz
("ussystem" => USSystem),
/// UK System of measurement: feet, pints, etc.; pints are 20oz
("uksystem" => UKSystem)
}, "ms");

View File

@@ -0,0 +1,18 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
enum_keyword!(
/// A Measurement Unit Preference Override defines an override for measurement unit preference.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#MeasurementUnitPreferenceOverride).
MeasurementUnitOverride {
/// Celsius as temperature unit
("celsius" => Celsius),
/// Kelvin as temperature unit
("kelvin" => Kelvin),
/// Fahrenheit as temperature unit
// The subtag spelling "fahrenhe" is intentional, not a typo: subtags are limited to
// eight characters, so the identifier is truncated.
("fahrenhe" => Fahrenheit),
}, "mu");

View File

@@ -0,0 +1,46 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A list of Preferences derived from Locale unicode extension keywords.
#![allow(unused_imports)]
mod calendar;
pub use calendar::*;
mod collation;
pub use collation::*;
mod currency;
pub use currency::*;
mod currency_format;
pub use currency_format::*;
#[cfg(feature = "alloc")]
mod dictionary_break;
#[cfg(feature = "alloc")]
pub use dictionary_break::*;
mod emoji;
pub use emoji::*;
mod first_day;
pub use first_day::*;
mod hour_cycle;
pub use hour_cycle::*;
mod line_break;
pub use line_break::*;
mod line_break_word;
pub use line_break_word::*;
mod measurement_system;
pub use measurement_system::*;
mod measurement_unit_override;
pub use measurement_unit_override::*;
mod numbering_system;
pub use numbering_system::*;
mod region_override;
pub use region_override::*;
mod regional_subdivision;
pub use regional_subdivision::*;
mod sentence_supression;
pub use sentence_supression::*;
mod timezone;
pub use timezone::*;
mod variant;
pub use variant::*;

View File

@@ -0,0 +1,26 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{extensions::unicode::Value, subtags::Subtag};
struct_keyword!(
/// A Unicode Number System Identifier defines a type of number system.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeNumberSystemIdentifier).
[Copy]
NumberingSystem,
"nu",
Subtag,
// Parsing: the value must consist of exactly one subtag, which is stored as-is
// (no check against a list of known numbering systems is made here).
|input: Value| {
input
.into_single_subtag()
.map(Self)
.ok_or(PreferencesParseError::InvalidKeywordValue)
},
// Serialization: wrap the stored subtag back into a single-subtag value.
|input: NumberingSystem| {
crate::extensions::unicode::Value::from_subtag(Some(input.0))
}
);

View File

@@ -0,0 +1,63 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::extensions::unicode::{SubdivisionId, Value};
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
struct_keyword!(
/// A Region Override specifies an alternate region to use for obtaining certain region-specific default values.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#RegionOverride).
[Copy]
RegionOverride,
"rg",
SubdivisionId,
// Parsing: the value must be a single subtag whose text parses as a `SubdivisionId`
// (region followed by a subdivision suffix); otherwise `InvalidKeywordValue`.
|input: Value| {
input
.into_single_subtag()
.and_then(|subtag| subtag.as_str().parse().ok().map(Self))
.ok_or(PreferencesParseError::InvalidKeywordValue)
},
// Serialization: turn the `SubdivisionId` back into one subtag.
|input: RegionOverride| {
Value::from_subtag(Some(input.0.into_subtag()))
}
);
// Unit tests for parsing `-u-rg` values into `RegionOverride`.
#[cfg(test)]
mod test {
use super::*;
use crate::extensions::unicode;
use crate::extensions::unicode::subdivision_suffix;
use crate::subtags::region;
#[test]
fn region_override_test() {
// Two-letter region with a three-character suffix.
let val = unicode::value!("uksct");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("UK"));
assert_eq!(rg.0.suffix, subdivision_suffix!("sct"));
// Two-letter region with a two-character suffix.
let val = unicode::value!("usca");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("US"));
assert_eq!(rg.0.suffix, subdivision_suffix!("ca"));
// Three-digit region code.
let val = unicode::value!("419bel");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("419"));
assert_eq!(rg.0.suffix, subdivision_suffix!("bel"));
// Four-character suffix ("zzzz").
let val = unicode::value!("uszzzz");
let rg: RegionOverride = val.try_into().unwrap();
assert_eq!(rg.0.region, region!("us"));
assert_eq!(rg.0.suffix, subdivision_suffix!("zzzz"));
// Syntactically valid extension values that are not valid subdivision ids.
for i in &["4aabel", "a4bel", "ukabcde"] {
let val = unicode::Value::try_from_str(i).unwrap();
let rg: Result<RegionOverride, _> = val.try_into();
assert!(rg.is_err());
}
}
}

View File

@@ -0,0 +1,65 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{
extensions::unicode::{SubdivisionId, Value},
subtags::Subtag,
};
struct_keyword!(
    /// A Unicode Subdivision Identifier defines a regional subdivision used for locales.
    ///
    /// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeSubdivisionIdentifier).
    [Copy]
    RegionalSubdivision,
    "sd",
    SubdivisionId,
    // Parsing: the value must be a single subtag whose text parses as a `SubdivisionId`
    // (region followed by a subdivision suffix); otherwise `InvalidKeywordValue`.
    |input: Value| {
        input
            .into_single_subtag()
            .and_then(|subtag| subtag.as_str().parse().ok().map(Self))
            .ok_or(PreferencesParseError::InvalidKeywordValue)
    },
    // Serialization: concatenate the raw bytes of the region (2 or 3 bytes) and of the
    // suffix (up to 4 bytes) into one 8-byte subtag buffer.
    |input: RegionalSubdivision| {
        let region_raw = input.0.region.into_raw();
        let suffix_raw = input.0.suffix.into_raw();
        let mut raw = [0; 8];
        raw[0] = region_raw[0];
        raw[1] = region_raw[1];
        // For a two-letter region this byte is 0 and is overwritten by the suffix below.
        raw[2] = region_raw[2];
        let len = input.0.region.as_str().len();
        debug_assert!((2..=3).contains(&len));
        #[allow(clippy::indexing_slicing)] // safe: len <= 3, so len + 3 <= 6 < 8
        {
            // Copy all four suffix bytes; each destination index must pair with the same
            // source index, otherwise the last suffix byte is lost or duplicated.
            raw[len] = suffix_raw[0];
            raw[len + 1] = suffix_raw[1];
            raw[len + 2] = suffix_raw[2];
            raw[len + 3] = suffix_raw[3];
        }
        #[expect(clippy::unwrap_used)] // correct by construction
        Value::from_subtag(Some(Subtag::try_from_raw(raw).unwrap()))
    }
);
// Unit tests for parsing `-u-sd` values into `RegionalSubdivision`.
#[cfg(test)]
mod test {
use super::*;
use crate::extensions::unicode;
use crate::extensions::unicode::subdivision_suffix;
use crate::subtags::region;
#[test]
fn region_subdivision_test() {
// A well-formed value splits into region and suffix.
let val = unicode::value!("uksct");
let rg: RegionalSubdivision = val.try_into().unwrap();
assert_eq!(rg.region, region!("UK"));
assert_eq!(rg.suffix, subdivision_suffix!("sct"));
// Syntactically valid extension values that are not valid subdivision ids.
for i in &["4aabel", "a4bel", "ukabcde"] {
let val = unicode::Value::try_from_str(i).unwrap();
let rg: Result<RegionalSubdivision, _> = val.try_into();
assert!(rg.is_err());
}
}
}

View File

@@ -0,0 +1,19 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
// Generates the `ss` preference keyword as an enum with the variants `None` and `Standard`.
// NOTE(review): the type name is spelled "Supressions" (single "p"); it is the public
// identifier, so the spelling is left untouched here.
enum_keyword!(
/// A Unicode Sentence Break Suppressions Identifier defines a set of data to be used for suppressing certain
/// sentence breaks that would otherwise be found by UAX #14 rules.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeSentenceBreakSuppressionsIdentifier).
[Default]
SentenceBreakSupressions {
/// Don't use sentence break suppressions data (the default)
[default]
("none" => None),
/// Use sentence break suppressions data of type "standard"
("standard" => Standard),
}, "ss");

View File

@@ -0,0 +1,26 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::errors::PreferencesParseError;
use crate::preferences::extensions::unicode::struct_keyword;
use crate::{extensions::unicode::Value, subtags::Subtag};
struct_keyword!(
    /// A Unicode Timezone Identifier defines a timezone.
    ///
    /// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeTimezoneIdentifier).
    [Copy]
    TimeZoneShortId,
    "tz",
    Subtag,
    // Parsing succeeds only when `into_single_subtag` yields a subtag; that subtag is
    // stored as-is.
    |input: Value| {
        match input.into_single_subtag() {
            Some(subtag) => Ok(Self(subtag)),
            None => Err(PreferencesParseError::InvalidKeywordValue),
        }
    },
    // Serialization wraps the stored subtag back into a value.
    |input: TimeZoneShortId| { Value::from_subtag(Some(input.0)) }
);

View File

@@ -0,0 +1,14 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::preferences::extensions::unicode::enum_keyword;
// Generates the `va` preference keyword as an enum; `"posix"` is the only value listed,
// so any other first subtag is rejected by the generated `TryFrom` implementation.
enum_keyword!(
/// A Unicode Variant Identifier defines a special variant used for locales.
///
/// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeVariantIdentifier).
CommonVariantType {
/// POSIX style locale variant
("posix" => Posix),
}, "va");

View File

@@ -0,0 +1,322 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Internal macro used by `enum_keyword` for nesting.
///
/// Expands to the enum value built for one matched first subtag: either a plain
/// variant, or a variant wrapping an optional sub-value read from the second subtag.
#[macro_export]
#[doc(hidden)]
macro_rules! __enum_keyword_inner {
// Plain variant: nothing nested, so the result is just `$name::$variant`.
($name:ident, $variant:ident) => {
$name::$variant
};
// Nested variant: `$s` is the value being parsed, `$v2` the sub-enum type and the
// `$subk => $subv` pairs map second-subtag strings to `$v2` variants.
($name:ident, $variant:ident, $s:ident, $v2:ident, $($subk:expr => $subv:ident),*) => {{
// Look at the subtag at index 1; a missing or unrecognized subtag yields `None`
// rather than an error.
let sv = $s.get_subtag(1).and_then(|st| {
match st.as_str() {
$(
$subk => Some($v2::$subv),
)*
_ => None,
}
});
$name::$variant(sv)
}};
}
/// Macro used to generate a preference keyword as an enum.
///
/// The macro supports single and two subtag enums.
///
/// # Examples
///
/// ```
/// use icu::locale::preferences::extensions::unicode::enum_keyword;
///
/// enum_keyword!(
/// EmojiPresentationStyle {
/// ("emoji" => Emoji),
/// ("text" => Text),
/// ("default" => Default)
/// }, "em");
///
/// enum_keyword!(
/// MetaKeyword {
/// ("normal" => Normal),
/// ("emoji" => Emoji(EmojiPresentationStyle) {
/// ("emoji" => Emoji),
/// ("text" => Text),
/// ("default" => Default)
/// })
/// }, "mk");
/// ```
#[macro_export]
#[doc(hidden)]
macro_rules! __enum_keyword {
// Arm 1: declares only the enum type. Each variant is either a unit variant or, when a
// second identifier `$v2` is given, a tuple variant holding `Option<$v2>`.
(
$(#[$doc:meta])*
$([$derive_attrs:ty])?
$name:ident {
$(
$(#[$variant_doc:meta])*
$([$variant_attr:ty])?
$variant:ident $($v2:ident)?
),*
}
) => {
#[non_exhaustive]
#[derive(Debug, Clone, Eq, PartialEq, Copy, Hash)]
$(#[derive($derive_attrs)])?
$(#[$doc])*
pub enum $name {
$(
$(#[$variant_doc])*
$(#[$variant_attr])?
$variant $((Option<$v2>))?
),*
}
};
// Arm 2: full keyword definition. Takes `("key" => Variant)` pairs (optionally with a
// nested `(SubEnum) { ("subkey" => SubVariant), ... }` list), the unicode extension key
// literal, and optionally an identifier plus a statement used for alias handling.
($(#[$doc:meta])*
$([$derive_attrs:ty])?
$name:ident {
$(
$(#[$variant_doc:meta])*
$([$variant_attr:ty])?
($key:expr => $variant:ident $(($v2:ident) {
$(
($subk:expr => $subv:ident)
),*
})?)
),* $(,)?
},
$ext_key:literal
$(, $input:ident, $aliases:stmt)?
) => {
// Declare the enum itself by delegating to arm 1.
$crate::__enum_keyword!(
$(#[$doc])*
$([$derive_attrs])?
$name {
$(
$(#[$variant_doc])*
$([$variant_attr])?
$variant $($v2)?
),*
}
);
impl $crate::preferences::PreferenceKey for $name {
fn unicode_extension_key() -> Option<$crate::extensions::unicode::Key> {
Some($crate::extensions::unicode::key!($ext_key))
}
// Returns `Ok(None)` when `key` is not this keyword's key; otherwise parses `value`.
fn try_from_key_value(
key: &$crate::extensions::unicode::Key,
value: &$crate::extensions::unicode::Value,
) -> Result<Option<Self>, $crate::preferences::extensions::unicode::errors::PreferencesParseError> {
if Self::unicode_extension_key() == Some(*key) {
Self::try_from(value).map(Some)
} else {
Ok(None)
}
}
fn unicode_extension_value(&self) -> Option<$crate::extensions::unicode::Value> {
Some((*self).into())
}
}
impl TryFrom<&$crate::extensions::unicode::Value> for $name {
type Error = $crate::preferences::extensions::unicode::errors::PreferencesParseError;
fn try_from(s: &$crate::extensions::unicode::Value) -> Result<Self, Self::Error> {
let subtag = s.get_subtag(0)
// No subtag is equivalent to the "true" value.
.unwrap_or(&$crate::subtags::subtag!("true"));
// `value!` is imported so that caller-supplied alias statements can use it.
#[allow(unused_imports)]
use $crate::extensions::unicode::value;
// Optional alias hook: binds the caller-chosen identifier to the input value and
// runs the caller's statement, which may return early.
$(
let $input = s;
$aliases
)?
// Dispatch on the first subtag; unknown strings are rejected.
Ok(match subtag.as_str() {
$(
$key => {
$crate::__enum_keyword_inner!($name, $variant$(, s, $v2, $($subk => $subv),*)?)
}
)*
_ => {
return Err(Self::Error::InvalidKeywordValue);
}
})
}
}
impl From<$name> for $crate::extensions::unicode::Value {
fn from(input: $name) -> $crate::extensions::unicode::Value {
// `f` is the first subtag (always set by the match below); `s` is the optional
// second subtag, set only for nested variants that carry `Some(..)`.
let f;
#[allow(unused_mut)]
let mut s = None;
match input {
$(
// This works around a limitation of macro_rules: we need a conditional
// $()? case here for when the variant has a value, and macro_rules requires us to
// reference $v2 inside it, but in a match arm it becomes a variable binding, so clippy
// complains about its name.
#[allow(non_snake_case)]
$name::$variant $(($v2))? => {
f = $crate::subtags::subtag!($key);
$(
if let Some(v2) = $v2 {
match v2 {
$(
$v2::$subv => s = Some($crate::subtags::subtag!($subk)),
)*
}
}
)?
},
)*
}
if let Some(s) = s {
$crate::extensions::unicode::Value::from_two_subtags(f, s)
} else {
$crate::extensions::unicode::Value::from_subtag(Some(f))
}
}
}
impl $name {
/// A helper function for displaying as a `&str`.
pub const fn as_str(&self) -> &'static str {
match self {
$(
// This works around a limitation of macro_rules: we need a conditional
// $()? case here for when the variant has a value, and macro_rules requires us to
// reference $v2 inside it, but in a match arm it becomes a variable binding, so clippy
// complains about its name.
#[allow(non_snake_case)]
Self::$variant $(($v2))? => {
$(
if let Some(v2) = $v2 {
// Nested value present: the string is "<key>-<subkey>", built at compile time.
return match v2 {
$(
$v2::$subv => concat!($key, '-', $subk),
)*
};
}
)?
return $key;
},
)*
}
}
}
};
}
pub use __enum_keyword as enum_keyword;
#[cfg(test)]
mod tests {
    use super::*;
    use crate::extensions::unicode;
    use core::str::FromStr;

    /// Flat enum keyword: known values round-trip, unknown values are rejected.
    #[test]
    fn enum_keywords_test() {
        enum_keyword!(DummyKeyword {
            ("standard" => Standard),
            ("rare" => Rare),
        }, "dk");

        for (text, expected) in [
            ("standard", DummyKeyword::Standard),
            ("rare", DummyKeyword::Rare),
        ] {
            let value = unicode::Value::from_str(text).unwrap();
            let parsed = DummyKeyword::try_from(&value).unwrap();
            assert_eq!(parsed, expected);
            assert_eq!(unicode::Value::from(parsed), value);
        }

        let unknown = unicode::Value::from_str("foo").unwrap();
        assert!(DummyKeyword::try_from(&unknown).is_err());

        assert_eq!(DummyKeyword::Standard.as_str(), "standard");
    }

    /// Alias hook: "std" parses to `Standard` and serializes back as "standard".
    #[test]
    fn enum_keywords_test_alias() {
        enum_keyword!(DummyKeyword {
            ("standard" => Standard),
            ("rare" => Rare),
        }, "dk", s, if *s == value!("std") { return Ok(Self::Standard) });

        // (input text, expected variant, text the variant serializes to)
        for (text, expected, canonical) in [
            ("standard", DummyKeyword::Standard, "standard"),
            ("std", DummyKeyword::Standard, "standard"),
            ("rare", DummyKeyword::Rare, "rare"),
        ] {
            let value = unicode::Value::from_str(text).unwrap();
            let parsed = DummyKeyword::try_from(&value).unwrap();
            assert_eq!(parsed, expected);
            assert_eq!(
                unicode::Value::from(parsed),
                unicode::Value::from_str(canonical).unwrap()
            );
        }

        let unknown = unicode::Value::from_str("foo").unwrap();
        assert!(DummyKeyword::try_from(&unknown).is_err());

        assert_eq!(DummyKeyword::Standard.as_str(), "standard");
    }

    /// Nested enum keyword: the second subtag selects an optional sub-value.
    #[test]
    fn enum_keywords_nested_test() {
        enum_keyword!(DummySubKeyword { Standard, Rare });
        enum_keyword!(DummyKeyword {
            ("default" => Default),
            ("sub" => Sub(DummySubKeyword) {
                ("standard" => Standard),
                ("rare" => Rare)
            })
        }, "dk");

        // Inputs that round-trip to exactly the same value.
        for (text, expected) in [
            ("default", DummyKeyword::Default),
            ("sub", DummyKeyword::Sub(None)),
            (
                "sub-standard",
                DummyKeyword::Sub(Some(DummySubKeyword::Standard)),
            ),
            ("sub-rare", DummyKeyword::Sub(Some(DummySubKeyword::Rare))),
        ] {
            let value = unicode::Value::from_str(text).unwrap();
            let parsed = DummyKeyword::try_from(&value).unwrap();
            assert_eq!(parsed, expected);
            assert_eq!(unicode::Value::from(parsed), value);
        }

        // An unknown first subtag is an error.
        let unknown = unicode::Value::from_str("foo").unwrap();
        assert!(DummyKeyword::try_from(&unknown).is_err());

        // An unknown second subtag is dropped instead of failing.
        let unknown_sub = unicode::Value::from_str("sub-foo").unwrap();
        let parsed = DummyKeyword::try_from(&unknown_sub).unwrap();
        assert_eq!(parsed, DummyKeyword::Sub(None));
        assert_eq!(unicode::Value::from(parsed), unicode::value!("sub"));

        assert_eq!(
            DummyKeyword::Sub(Some(DummySubKeyword::Rare)).as_str(),
            "sub-rare"
        );
    }
}

View File

@@ -0,0 +1,11 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod enum_keyword;
mod struct_keyword;
#[doc(inline)]
pub use enum_keyword::enum_keyword;
#[doc(inline)]
pub use struct_keyword::struct_keyword;

View File

@@ -0,0 +1,124 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Macro used to generate a preference keyword as a struct.
///
/// # Examples
///
/// ```
/// use icu::locale::{
/// extensions::unicode::{Key, Value},
/// preferences::extensions::unicode::struct_keyword,
/// };
///
/// struct_keyword!(
/// CurrencyType,
/// "cu",
/// String,
/// |input: Value| { Ok(Self(input.to_string())) },
/// |input: CurrencyType| {
/// icu::locale::extensions::unicode::Value::try_from_str(
/// input.0.as_str(),
/// )
/// .unwrap()
/// }
/// );
/// ```
#[macro_export]
#[doc(hidden)]
macro_rules! __struct_keyword {
// Arguments: optional doc attributes, an optional extra derive in square brackets, the
// struct name, the unicode extension key literal, the wrapped value type, an expression
// called with a `Value` to parse it, and an expression called with the struct to
// serialize it.
($(#[$doc:meta])* $([$derive_attrs:ty])? $name:ident, $ext_key:literal, $value:ty, $try_from:expr, $into:expr) => {
$(#[$doc])*
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
$(#[derive($derive_attrs)])?
#[allow(clippy::exhaustive_structs)] // TODO
pub struct $name($value);
// Parsing is delegated entirely to the caller-supplied `$try_from` expression.
impl TryFrom<$crate::extensions::unicode::Value> for $name {
type Error = $crate::preferences::extensions::unicode::errors::PreferencesParseError;
fn try_from(
input: $crate::extensions::unicode::Value,
) -> Result<Self, Self::Error> {
$try_from(input)
}
}
// Serialization is delegated entirely to the caller-supplied `$into` expression.
impl From<$name> for $crate::extensions::unicode::Value {
fn from(input: $name) -> $crate::extensions::unicode::Value {
$into(input)
}
}
impl $crate::preferences::PreferenceKey for $name {
fn unicode_extension_key() -> Option<$crate::extensions::unicode::Key> {
Some($crate::extensions::unicode::key!($ext_key))
}
// Returns `Ok(None)` when `key` is not this keyword's key; otherwise parses a clone
// of `value` (the `TryFrom` impl above takes the value by ownership).
fn try_from_key_value(
key: &$crate::extensions::unicode::Key,
value: &$crate::extensions::unicode::Value,
) -> Result<Option<Self>, $crate::preferences::extensions::unicode::errors::PreferencesParseError> {
if Self::unicode_extension_key() == Some(*key) {
let result = Self::try_from(value.clone())?;
Ok(Some(result))
} else {
Ok(None)
}
}
fn unicode_extension_value(
&self,
) -> Option<$crate::extensions::unicode::Value> {
Some(self.clone().into())
}
}
// Gives read access to the wrapped value through `*keyword` / auto-deref.
impl core::ops::Deref for $name {
type Target = $value;
fn deref(&self) -> &Self::Target {
&self.0
}
}
};
}
pub use __struct_keyword as struct_keyword;
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        extensions::unicode,
        subtags::{subtag, Subtag},
    };
    use core::str::FromStr;

    /// A struct keyword that accepts only a single three-character subtag.
    #[test]
    fn struct_keywords_test() {
        struct_keyword!(
            DummyKeyword,
            "dk",
            Subtag,
            |input: unicode::Value| {
                match input.into_single_subtag() {
                    Some(subtag) if subtag.len() == 3 => Ok(DummyKeyword(subtag)),
                    _ => Err(crate::preferences::extensions::unicode::errors::PreferencesParseError::InvalidKeywordValue),
                }
            },
            |input: DummyKeyword| { unicode::Value::from_subtag(Some(input.0)) }
        );

        // Three characters: accepted, and converts back to the same value.
        let accepted = unicode::Value::from_str("foo").unwrap();
        let keyword = DummyKeyword::try_from(accepted.clone()).unwrap();
        assert_eq!(keyword, DummyKeyword(subtag!("foo")));
        assert_eq!(unicode::Value::from(keyword), accepted);

        // Six characters: rejected by the length check.
        let rejected = unicode::Value::from_str("foobar").unwrap();
        let outcome: Result<DummyKeyword, _> = rejected.try_into();
        assert!(outcome.is_err());
    }
}

View File

@@ -0,0 +1,17 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A set of unicode extensions which correspond to preferences.
//!
//! The module contains a set of structs corresponding to Locale [`unicode`](crate::extensions::unicode)
//! extensions for which ICU4X provides implementations of preferences.
//!
//! The macros in this module provide wrappers for creating preferences based on enums and structs.
//!
//! [`Locale`]: crate::Locale
pub mod errors;
pub mod keywords;
mod macros;
#[doc(inline)]
pub use macros::*;

View File

@@ -0,0 +1,181 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#[cfg(feature = "alloc")]
use crate::subtags::Variants;
use crate::subtags::{Language, Region, Script, Subtag, Variant};
use crate::DataLocale;
/// The structure storing locale subtags used in preferences.
///
/// Every field except `language` is optional; `None` means the preference is not set.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct LocalePreferences {
/// Preference of Language
pub(crate) language: Language,
/// Preference of Script
pub(crate) script: Option<Script>,
/// Preference of Region
pub(crate) region: Option<Region>,
/// Preference of Variant (at most a single variant is stored)
pub(crate) variant: Option<Variant>,
/// Preference of Regional Subdivision (the `sd` unicode keyword when built from a locale)
pub(crate) subdivision: Option<Subtag>,
/// Preference of Unicode Extension Region (the `rg` unicode keyword when built from a locale)
pub(crate) ue_region: Option<Region>,
}
impl LocalePreferences {
    /// Builds a [`DataLocale`] from these preferences.
    ///
    /// When `region_priority` is `true` and both the region and the Unicode extension
    /// region are set, the extension region is used. In every other case the region
    /// (possibly `None`) is passed through unchanged.
    fn to_data_locale_maybe_region_priority(self, region_priority: bool) -> DataLocale {
        let prefer_extension_region =
            region_priority && self.region.is_some() && self.ue_region.is_some();
        let region = if prefer_extension_region {
            self.ue_region
        } else {
            self.region
        };
        DataLocale {
            language: self.language,
            script: self.script,
            region,
            variant: self.variant,
            subdivision: self.subdivision,
        }
    }

    /// Convert to a DataLocale, with region-based fallback priority
    ///
    /// Most users should use `icu_provider::marker::make_locale()` instead.
    pub fn to_data_locale_region_priority(self) -> DataLocale {
        let region_priority = true;
        self.to_data_locale_maybe_region_priority(region_priority)
    }

    /// Convert to a DataLocale, with language-based fallback priority
    ///
    /// Most users should use `icu_provider::marker::make_locale()` instead.
    pub fn to_data_locale_language_priority(self) -> DataLocale {
        let region_priority = false;
        self.to_data_locale_maybe_region_priority(region_priority)
    }
}
impl Default for LocalePreferences {
// `Self::default()` below resolves to the inherent `const fn default` defined on
// `LocalePreferences` (inherent associated functions take precedence over trait
// methods in path resolution), so this delegates to it rather than recursing.
// Do not rewrite it as `<Self as Default>::default()`.
fn default() -> Self {
Self::default()
}
}
impl From<&crate::Locale> for LocalePreferences {
    /// Extracts the language identifier subtags plus the `sd` and `rg` unicode keywords.
    ///
    /// Only the first variant of the locale is kept. A keyword is ignored unless its
    /// value is a single subtag; `rg` is additionally ignored unless that subtag parses
    /// as a [`Region`].
    fn from(loc: &crate::Locale) -> Self {
        let keywords = &loc.extensions.unicode.keywords;

        let subdivision = keywords
            .get(&crate::extensions::unicode::key!("sd"))
            .and_then(|value| value.as_single_subtag())
            .copied();

        let ue_region = keywords
            .get(&crate::extensions::unicode::key!("rg"))
            .and_then(|value| value.as_single_subtag())
            .and_then(|subtag| Region::try_from_str(subtag.as_str()).ok());

        let first_variant = loc.id.variants.iter().next().copied();

        Self {
            language: loc.id.language,
            script: loc.id.script,
            region: loc.id.region,
            variant: first_variant,
            subdivision,
            ue_region,
        }
    }
}
impl From<&crate::LanguageIdentifier> for LocalePreferences {
    /// Copies language, script and region, and keeps only the first variant.
    ///
    /// A language identifier carries no extensions, so both extension-derived
    /// preferences are left unset.
    fn from(lid: &crate::LanguageIdentifier) -> Self {
        let first_variant = lid.variants.iter().next().copied();
        Self {
            language: lid.language,
            script: lid.script,
            region: lid.region,
            variant: first_variant,
            subdivision: None,
            ue_region: None,
        }
    }
}
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl From<LocalePreferences> for crate::Locale {
    /// Rebuilds a locale: the subtags go into the language identifier, while the
    /// subdivision and extension region (when set) become the `sd` and `rg` keywords.
    fn from(prefs: LocalePreferences) -> Self {
        let variants = prefs
            .variant
            .map(Variants::from_variant)
            .unwrap_or_default();

        let mut extensions = crate::extensions::Extensions::default();
        if let Some(sd) = prefs.subdivision {
            extensions.unicode.keywords.set(
                crate::extensions::unicode::key!("sd"),
                crate::extensions::unicode::Value::from_subtag(Some(sd)),
            );
        }
        if let Some(rg) = prefs.ue_region {
            #[expect(clippy::unwrap_used)] // Region is a valid Subtag
            extensions.unicode.keywords.set(
                crate::extensions::unicode::key!("rg"),
                crate::extensions::unicode::Value::try_from_str(rg.as_str()).unwrap(),
            );
        }

        Self {
            id: crate::LanguageIdentifier {
                language: prefs.language,
                script: prefs.script,
                region: prefs.region,
                variants,
            },
            extensions,
        }
    }
}
impl LocalePreferences {
/// Constructs a new [`LocalePreferences`] struct with the defaults.
pub const fn default() -> Self {
Self {
language: Language::UNKNOWN,
script: None,
region: None,
variant: None,
subdivision: None,
ue_region: None,
}
}
/// Preference of Language
pub const fn language(&self) -> Language {
self.language
}
/// Preference of Region
pub const fn region(&self) -> Option<Region> {
self.region
}
/// Extends the preferences with the values from another set of preferences.
pub fn extend(&mut self, other: LocalePreferences) {
if !other.language.is_unknown() {
self.language = other.language;
}
if let Some(script) = other.script {
self.script = Some(script);
}
if let Some(region) = other.region {
self.region = Some(region);
}
if let Some(variant) = other.variant {
self.variant = Some(variant);
}
if let Some(sd) = other.subdivision {
self.subdivision = Some(sd);
}
if let Some(ue_region) = other.ue_region {
self.ue_region = Some(ue_region);
}
}
}

View File

@@ -0,0 +1,634 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This API provides necessary functionality for building user preferences structs.
//!
//! It includes the ability to merge information between the struct and a [`Locale`],
//! facilitating the resolution of attributes against default values.
//!
//! Preferences structs serve as a composable argument to `ICU4X` constructors, allowing
//! for ergonomic merging between information encoded in multiple sets of user inputs:
//! Locale, application preferences and operating system preferences.
//!
//! The crate is intended primarily to be used by component constructors to normalize the format
//! of ingesting preferences across all of `ICU4X`.
//!
//! # Preferences vs Options
//!
//! ICU4X introduces a separation between two classes of parameters that are used
//! to adjust the behavior of a component.
//!
//! `Preferences` represent the user-driven preferences on how the given user wants the internationalization
//! to behave. Those are items like language, script, calendar and numbering systems etc.
//!
//! `Options` represent the developer-driven adjustments that affect how given information is presented
//! based on the requirements of the application like available space or intended tone.
//!
//! # Options Division
//!
//! The `Options` themselves are also divided into options that are affecting data slicing, and ones that don't.
//! This is necessary to allow for DCE and FFI to produce minimal outputs avoiding loading unnecessary data that
//! is never to be used by a given component.
//! The result is that some option keys affect specialized constructors such as `try_new_short`, `try_new_long`, which
//! result in data provider loading only data necessary to format short or long values respectively.
//! For options that are not affecting data slicing, an `Options` struct is provided that the developer
//! can fill with selected key values, or use the defaults.
//!
//! # Preferences Merging
//!
//! In traditional internationalization APIs, the argument passed to constructors is a locale.
//! ICU4X changes this paradigm by accepting a `Preferences`, which can be extracted from a [`Locale`] and combined with
//! other `Preferences`s provided by the environment.
//!
//! This approach makes it easy for developers to write code that takes just a locale, as in other systems,
//! as well as handle more sophisticated cases where the application may receive, for example, a locale,
//! a set of internationalization preferences specified within the application,
//! and a third set extracted from the operating system's preferences.
//!
//! # ECMA-402 vs ICU4X
//!
//! The result of the two paradigm shifts presented above is that the way constructors work is different.
//!
//! ## ECMA-402
//! ```ignore
//! let locale = new Locale("en-US-u-hc-h12");
//! let options = {
//! hourCycle: "h24", // user preference
//! timeStyle: "long", // developer option
//! };
//!
//! let dtf = new DateTimeFormat(locale, options);
//! ```
//!
//! ## ICU4X
//! ```ignore
//! let loc = locale!("en-US-u-hc-h12");
//! let prefs = DateTimeFormatterPreferences {
//! hour_cycle: HourCycle::H23,
//! };
//! let options = DateTimeFormatterOptions {
//! time_style: TimeStyle::Long,
//! };
//!
//! let mut combined_prefs = DateTimeFormatterPreferences::from(loc);
//! combined_prefs.extend(prefs);
//!
//! let dtf = DateTimeFormatter::try_new(combined_prefs, options);
//! ```
//!
//! This architecture allows for flexible composition of user and developer settings
//! sourced from different locations in custom ways based on the needs of each deployment.
//!
//! Below are some examples of how the `Preferences` model can be used in different setups.
//!
//! # Examples
//!
//! ```
//! use icu::locale::preferences::{
//! define_preferences,
//! extensions::unicode::keywords::HourCycle,
//! };
//! use icu::locale::locale;
//!
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! define_preferences!(
//! /// Name of the preferences struct
//! [Copy]
//! ExampleComponentPreferences,
//! {
//! /// A preference relevant to the component
//! hour_cycle: HourCycle
//! }
//! );
//!
//! pub struct ExampleComponent {
//! data: MyData,
//! }
//!
//! impl ExampleComponent {
//! pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! let locale = get_data_locale_from_prefs(prefs);
//! let data = load_data(locale);
//!
//! Self { data }
//! }
//! }
//! ```
//!
//! Now we can use that component in multiple different ways,
//!
//! ## Scenario 1: Use Locale as the only input
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US-u-hc-h23");
//! let tf = ExampleComponent::new(loc.into());
//! ```
//!
//! ## Scenario 2: Compose Preferences and Locale
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US-u-hc-h23");
//! let app_prefs = ExampleComponentPreferences {
//! hour_cycle: Some(HourCycle::H12),
//! ..Default::default()
//! };
//!
//! let mut combined_prefs = ExampleComponentPreferences::from(loc);
//! combined_prefs.extend(app_prefs);
//!
//! // HourCycle is set from the prefs bag and overrides the value from the locale
//! assert_eq!(combined_prefs.hour_cycle, Some(HourCycle::H12));
//!
//! let tf = ExampleComponent::new(combined_prefs);
//! ```
//!
//! ## Scenario 3: Merge Preferences from Locale, OS, and Application
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US");
//!
//! // Simulate OS preferences
//! let os_prefs = ExampleComponentPreferences {
//! hour_cycle: Some(HourCycle::H23),
//! ..Default::default()
//! };
//!
//! // Application does not specify hour_cycle
//! let app_prefs = ExampleComponentPreferences {
//! hour_cycle: None,
//! ..Default::default()
//! };
//!
//! let mut combined_prefs = ExampleComponentPreferences::from(loc);
//! combined_prefs.extend(os_prefs);
//! combined_prefs.extend(app_prefs);
//!
//! // HourCycle is set from the OS preferences since the application didn't specify it
//! assert_eq!(combined_prefs.hour_cycle, Some(HourCycle::H23));
//!
//! let tf = ExampleComponent::new(combined_prefs);
//! ```
//!
//! ## Scenario 4: Neither Application nor OS specify the preference
//! ```
//! # use icu::locale::preferences::{
//! # define_preferences,
//! # extensions::unicode::keywords::HourCycle,
//! # };
//! # use icu::locale::locale;
//! # fn get_data_locale_from_prefs(input: ExampleComponentPreferences) -> () { () }
//! # fn load_data(locale: ()) -> MyData { MyData {} }
//! # struct MyData {}
//! # define_preferences!(
//! # /// Name of the preferences struct
//! # [Copy]
//! # ExampleComponentPreferences,
//! # {
//! # /// A preference relevant to the component
//! # hour_cycle: HourCycle
//! # }
//! # );
//! #
//! # pub struct ExampleComponent {
//! # data: MyData,
//! # }
//! # impl ExampleComponent {
//! # pub fn new(prefs: ExampleComponentPreferences) -> Self {
//! # let locale = get_data_locale_from_prefs(prefs);
//! # let data = load_data(locale);
//! #
//! # Self { data }
//! # }
//! # }
//! let loc = locale!("en-US-u-hc-h23");
//!
//! // Simulate OS preferences
//! let os_prefs = ExampleComponentPreferences::default(); // OS does not specify hour_cycle
//! let app_prefs = ExampleComponentPreferences::default(); // Application does not specify hour_cycle
//!
//! let mut combined_prefs = ExampleComponentPreferences::from(loc);
//! combined_prefs.extend(os_prefs);
//! combined_prefs.extend(app_prefs);
//!
//! // HourCycle is taken from the locale
//! assert_eq!(combined_prefs.hour_cycle, Some(HourCycle::H23));
//!
//! let tf = ExampleComponent::new(combined_prefs);
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
//! [`Locale`]: crate::Locale
pub mod extensions;
mod locale;
pub use locale::*;
/// A low-level trait implemented on each preference exposed in component preferences.
///
/// [`PreferenceKey`] has to be implemented on
/// preferences that are to be included in Formatter preferences.
/// The trait may be implemented to indicate that the given preference has
/// a unicode key corresponding to it or be a custom one.
///
/// `ICU4X` provides an implementation of [`PreferenceKey`] for all
/// Unicode Extension Keys. The only external use of this trait is to implement
/// it on custom preferences that are to be included in a component preferences bag.
///
/// The example below showcases a manual generation of an `em` (emoji) unicode extension key
/// and a custom struct to showcase the difference in their behavior. For all use purposes,
/// the [`EmojiPresentationStyle`](crate::preferences::extensions::unicode::keywords::EmojiPresentationStyle) preference exposed by this crate should be used.
///
/// # Examples
/// ```
/// use icu::locale::{
/// extensions::unicode::{key, Key, value, Value},
/// preferences::{
/// define_preferences, PreferenceKey,
/// extensions::unicode::errors::PreferencesParseError,
/// },
/// };
///
/// #[non_exhaustive]
/// #[derive(Debug, Clone, Eq, PartialEq, Copy, Hash, Default)]
/// pub enum EmojiPresentationStyle {
/// Emoji,
/// Text,
/// #[default]
/// Default,
/// }
///
/// impl PreferenceKey for EmojiPresentationStyle {
/// fn unicode_extension_key() -> Option<Key> {
/// Some(key!("em"))
/// }
///
/// fn try_from_key_value(
/// key: &Key,
/// value: &Value,
/// ) -> Result<Option<Self>, PreferencesParseError> {
/// if Self::unicode_extension_key() == Some(*key) {
/// let subtag = value.as_single_subtag()
/// .ok_or(PreferencesParseError::InvalidKeywordValue)?;
/// match subtag.as_str() {
/// "emoji" => Ok(Some(Self::Emoji)),
/// "text" => Ok(Some(Self::Text)),
/// "default" => Ok(Some(Self::Default)),
/// _ => Err(PreferencesParseError::InvalidKeywordValue)
/// }
/// } else {
/// Ok(None)
/// }
/// }
///
/// fn unicode_extension_value(&self) -> Option<Value> {
/// Some(match self {
/// EmojiPresentationStyle::Emoji => value!("emoji"),
/// EmojiPresentationStyle::Text => value!("text"),
/// EmojiPresentationStyle::Default => value!("default"),
/// })
/// }
/// }
///
/// #[non_exhaustive]
/// #[derive(Debug, Clone, Eq, PartialEq, Hash)]
/// pub struct CustomFormat {
/// value: String
/// }
///
/// impl PreferenceKey for CustomFormat {}
///
/// define_preferences!(
/// MyFormatterPreferences,
/// {
/// emoji: EmojiPresentationStyle,
/// custom: CustomFormat
/// }
/// );
/// ```
/// [`ICU4X`]: ../icu/index.html
pub trait PreferenceKey: Sized {
/// Optional constructor of the given preference. It takes the
/// unicode extension key and if the key matches it attempts to construct
/// the preference based on the given value.
/// If the value is not a valid value for the given key, the constructor returns an error.
///
/// The default implementation returns `Ok(None)`, i.e. it never matches any key.
fn try_from_key_value(
_key: &crate::extensions::unicode::Key,
_value: &crate::extensions::unicode::Value,
) -> Result<Option<Self>, crate::preferences::extensions::unicode::errors::PreferencesParseError>
{
Ok(None)
}
/// Retrieve unicode extension key corresponding to a given preference.
///
/// The default implementation returns `None`.
fn unicode_extension_key() -> Option<crate::extensions::unicode::Key> {
None
}
/// Retrieve unicode extension value corresponding to the given instance of the preference.
///
/// The default implementation returns `None`.
fn unicode_extension_value(&self) -> Option<crate::extensions::unicode::Value> {
None
}
}
/// A macro to facilitate generation of preferences struct.
///
///
/// The generated preferences struct provides methods for merging and converting between [`Locale`] and
/// the preference bag. See [`preferences`](crate::preferences) for use cases.
///
/// In the example below, the input argument is the generated preferences struct which
/// can be auto-converted from a Locale, or combined from a Locale and Preferences Bag.
///
/// # Examples
/// ```
/// use icu::locale::{
/// preferences::{
/// define_preferences,
/// extensions::unicode::keywords::HourCycle
/// },
/// locale,
/// };
///
/// define_preferences!(
/// [Copy]
/// NoCalendarFormatterPreferences,
/// {
/// hour_cycle: HourCycle
/// }
/// );
///
/// struct NoCalendarFormatter {}
///
/// impl NoCalendarFormatter {
/// pub fn try_new(prefs: NoCalendarFormatterPreferences) -> Result<Self, ()> {
/// // load data and set struct fields based on the prefs input
/// Ok(Self {})
/// }
/// }
///
/// let loc = locale!("en-US");
///
/// let tf = NoCalendarFormatter::try_new(loc.into());
/// ```
///
/// [`Locale`]: crate::Locale
#[macro_export]
#[doc(hidden)]
macro_rules! __define_preferences {
// Input: optional attributes/docs, an optional bracketed extra derive, the struct
// name, and a brace-delimited, comma-separated list of `field: PreferenceType` pairs
// (each optionally preceded by its own attributes/docs).
(
$(#[$doc:meta])*
$([$derive_attrs:ty])?
$name:ident,
{
$(
$(#[$key_doc:meta])*
$key:ident: $pref:ty
),*
}
) => (
$(#[$doc])*
#[derive(Default, Debug, Clone, PartialEq, Eq, Hash)]
$(#[derive($derive_attrs)])?
#[non_exhaustive]
pub struct $name {
/// Locale Preferences for the Preferences structure.
pub locale_preferences: $crate::preferences::LocalePreferences,
$(
$(#[$key_doc])*
pub $key: Option<$pref>,
)*
}
impl From<$crate::Locale> for $name {
fn from(loc: $crate::Locale) -> Self {
$name::from(&loc)
}
}
impl From<&$crate::Locale> for $name {
fn from(loc: &$crate::Locale) -> Self {
// Lenient conversion: `from_locale_strict` returns the constructed value in both
// its `Ok` and `Err` variants, so the value is used regardless of the outcome.
$name::from_locale_strict(loc).unwrap_or_else(|e| e)
}
}
impl From<$crate::LanguageIdentifier> for $name {
fn from(lid: $crate::LanguageIdentifier) -> Self {
$name::from(&lid)
}
}
impl From<&$crate::LanguageIdentifier> for $name {
fn from(lid: &$crate::LanguageIdentifier) -> Self {
// Only the locale preferences are derived from `lid` here; every preference
// field starts out unset.
Self {
locale_preferences: lid.into(),
$(
$key: None,
)*
}
}
}
// impl From<$name> for $crate::Locale {
// fn from(other: $name) -> Self {
// use $crate::preferences::PreferenceKey;
// let mut result = Self::from(other.locale_preferences);
// $(
// if let Some(value) = other.$key {
// if let Some(ue) = <$pref>::unicode_extension_key() {
// let val = value.unicode_extension_value().unwrap();
// result.extensions.unicode.keywords.set(ue, val);
// }
// }
// )*
// result
// }
// }
impl $name {
/// Extends the preferences with the values from another set of preferences.
///
/// A preference field is overwritten only when `other` has it set.
pub fn extend(&mut self, other: $name) {
self.locale_preferences.extend(other.locale_preferences);
$(
if let Some(value) = other.$key {
self.$key = Some(value);
}
)*
}
#[doc = concat!("Construct a `", stringify!($name), "` from a `Locale`")]
///
/// Returns `Err` if any of the preference values are invalid.
/// The `Err` variant still carries the preferences that were constructed.
pub fn from_locale_strict(loc: &$crate::Locale) -> Result<Self, Self> {
use $crate::preferences::PreferenceKey;
let mut is_err = false;
// One local per preference field, filled in while scanning the keywords below.
$(
let mut $key = None;
)*
for (k, v) in loc.extensions.unicode.keywords.iter() {
// Offer the keyword to each preference type in declaration order.
$(
match <$pref>::try_from_key_value(k, v) {
Ok(Some(k)) => {
$key = Some(k);
// This keyword has been consumed; move on to the next keyword.
continue;
}
Ok(None) => {}
Err(_) => {
// Remember the failure but keep scanning the remaining preferences/keywords.
is_err = true
}
}
)*
}
let r = Self {
locale_preferences: loc.into(),
$(
$key,
)*
};
if is_err {
Err(r)
} else {
Ok(r)
}
}
}
)
}
/// Generates `impl From<&$name1> for $name2` between two preferences structs.
///
/// The two-argument form copies only `locale_preferences`; the three-argument form
/// additionally copies each listed field from the source to the same-named field of
/// the target. Every field not copied keeps the value from `$name2::default()`.
// NOTE(review): the fields are assigned by value out of a shared reference, so this
// presumably relies on `locale_preferences` and the listed fields being `Copy` — confirm.
#[macro_export]
#[doc(hidden)]
macro_rules! __prefs_convert {
// Form 1: copy the locale preferences only.
(
$name1:ident,
$name2:ident
) => {
impl From<&$name1> for $name2 {
fn from(other: &$name1) -> Self {
let mut result = Self::default();
result.locale_preferences = other.locale_preferences;
result
}
}
};
// Form 2: copy the locale preferences plus the listed fields.
(
$name1:ident,
$name2:ident,
{
$(
$key:ident
),*
}
) => {
impl From<&$name1> for $name2 {
fn from(other: &$name1) -> Self {
let mut result = Self::default();
result.locale_preferences = other.locale_preferences;
$(
result.$key = other.$key;
)*
result
}
}
};
}
#[doc(inline)]
pub use __define_preferences as define_preferences;
#[doc(inline)]
pub use __prefs_convert as prefs_convert;

197
vendor/icu_locale_core/src/serde.rs vendored Normal file
View File

@@ -0,0 +1,197 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::{LanguageIdentifier, Locale};
use core::{fmt::Display, marker::PhantomData, str::FromStr};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use writeable::Writeable;
impl Serialize for LanguageIdentifier {
    /// Serializes the identifier as the string produced by `write_to_string`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let rendered = self.write_to_string();
        serializer.serialize_str(&rendered)
    }
}
impl Serialize for Locale {
    /// Serializes the locale as the string produced by `write_to_string`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let rendered = self.write_to_string();
        serializer.serialize_str(&rendered)
    }
}
/// Serde visitor that produces a `T` by parsing the visited string with [`FromStr`].
struct ParseVisitor<T>(PhantomData<T>);

impl<T> serde::de::Visitor<'_> for ParseVisitor<T>
where
    T: FromStr,
    <T as FromStr>::Err: Display,
{
    type Value = T;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        formatter.write_str("a valid Unicode Language or Locale Identifier")
    }

    /// Parses `s` into `T`, turning a parse failure into a custom serde error.
    fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        match <T as FromStr>::from_str(s) {
            Ok(parsed) => Ok(parsed),
            Err(parse_error) => Err(E::custom(parse_error)),
        }
    }
}
impl<'de> Deserialize<'de> for LanguageIdentifier {
    /// Deserializes from a string, parsing it via [`ParseVisitor`].
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let visitor = ParseVisitor::<Self>(PhantomData);
        deserializer.deserialize_str(visitor)
    }
}
impl<'de> Deserialize<'de> for Locale {
    /// Deserializes from a string, parsing it via [`ParseVisitor`].
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let visitor = ParseVisitor::<Self>(PhantomData);
        deserializer.deserialize_str(visitor)
    }
}
// JSON round-trip checks for `LanguageIdentifier`, `Locale`, `Language`, `Script`
// and `Region`: serialize to a JSON string, deserialize from `&str` and from a reader,
// and verify that the malformed input `"2Xs"` is rejected for every type.
#[test]
fn json() {
use crate::subtags::{Language, Region, Script};
use crate::{langid, locale};
// LanguageIdentifier
assert_eq!(
serde_json::to_string(&langid!("en-US")).unwrap(),
r#""en-US""#
);
assert_eq!(
serde_json::from_str::<LanguageIdentifier>(r#""en-US""#).unwrap(),
langid!("en-US")
);
assert_eq!(
serde_json::from_reader::<_, LanguageIdentifier>(&br#""en-US""#[..]).unwrap(),
langid!("en-US")
);
assert!(serde_json::from_str::<LanguageIdentifier>(r#""2Xs""#).is_err());
// Locale
assert_eq!(
serde_json::to_string(&locale!("en-US-u-hc-h12")).unwrap(),
r#""en-US-u-hc-h12""#
);
assert_eq!(
serde_json::from_str::<Locale>(r#""en-US-u-hc-h12""#).unwrap(),
locale!("en-US-u-hc-h12")
);
assert_eq!(
serde_json::from_reader::<_, Locale>(&br#""en-US-u-hc-h12""#[..]).unwrap(),
locale!("en-US-u-hc-h12")
);
assert!(serde_json::from_str::<Locale>(r#""2Xs""#).is_err());
// Language
assert_eq!(
serde_json::to_string(&"fr".parse::<Language>().unwrap()).unwrap(),
r#""fr""#
);
assert_eq!(
serde_json::from_str::<Language>(r#""fr""#).unwrap(),
"fr".parse::<Language>().unwrap()
);
assert_eq!(
serde_json::from_reader::<_, Language>(&br#""fr""#[..]).unwrap(),
"fr".parse::<Language>().unwrap()
);
assert!(serde_json::from_str::<Language>(r#""2Xs""#).is_err());
// Script
assert_eq!(
serde_json::to_string(&"Latn".parse::<Script>().unwrap()).unwrap(),
r#""Latn""#
);
assert_eq!(
serde_json::from_str::<Script>(r#""Latn""#).unwrap(),
"Latn".parse::<Script>().unwrap()
);
assert_eq!(
serde_json::from_reader::<_, Script>(&br#""Latn""#[..]).unwrap(),
"Latn".parse::<Script>().unwrap()
);
assert!(serde_json::from_str::<Script>(r#""2Xs""#).is_err());
// Region
assert_eq!(
serde_json::to_string(&"US".parse::<Region>().unwrap()).unwrap(),
r#""US""#
);
assert_eq!(
serde_json::from_str::<Region>(r#""US""#).unwrap(),
"US".parse::<Region>().unwrap()
);
assert_eq!(
serde_json::from_reader::<_, Region>(&br#""US""#[..]).unwrap(),
"US".parse::<Region>().unwrap()
);
assert!(serde_json::from_str::<Region>(r#""2Xs""#).is_err());
}
// Postcard round-trip checks. The expected bytes below show `LanguageIdentifier` and
// `Locale` encoded as a length-prefixed string, while `Language`, `Script` and `Region`
// are encoded as fixed-width bytes (NUL-padded where the subtag is shorter).
#[test]
fn postcard() {
use crate::subtags::{Language, Region, Script};
use crate::{langid, locale};
// LanguageIdentifier
assert_eq!(
postcard::to_stdvec(&langid!("en-US")).unwrap(),
b"\x05en-US"
);
assert_eq!(
postcard::from_bytes::<LanguageIdentifier>(b"\x05en-US").unwrap(),
langid!("en-US")
);
assert!(postcard::from_bytes::<LanguageIdentifier>(b"\x032Xs").is_err());
// Locale
assert_eq!(
postcard::to_stdvec(&locale!("en-US-u-hc-h12")).unwrap(),
b"\x0Een-US-u-hc-h12"
);
assert_eq!(
postcard::from_bytes::<Locale>(b"\x0Een-US-u-hc-h12").unwrap(),
locale!("en-US-u-hc-h12")
);
assert!(postcard::from_bytes::<Locale>(b"\x032Xs").is_err());
// Language
assert_eq!(
postcard::to_stdvec(&"fr".parse::<Language>().unwrap()).unwrap(),
b"fr\0"
);
assert_eq!(
postcard::from_bytes::<Language>(b"fr\0").unwrap(),
"fr".parse::<Language>().unwrap()
);
assert!(postcard::from_bytes::<Language>(b"2Xs").is_err());
// Script
assert_eq!(
postcard::to_stdvec(&"Latn".parse::<Script>().unwrap()).unwrap(),
b"Latn"
);
assert_eq!(
postcard::from_bytes::<Script>(b"Latn").unwrap(),
"Latn".parse::<Script>().unwrap()
);
assert!(postcard::from_bytes::<Script>(b"2Xss").is_err());
// Region
assert_eq!(
postcard::to_stdvec(&"US".parse::<Region>().unwrap()).unwrap(),
b"US\0"
);
assert_eq!(
postcard::from_bytes::<Region>(b"US\0").unwrap(),
"US".parse::<Region>().unwrap()
);
assert!(postcard::from_bytes::<Region>(b"2Xs").is_err());
}

View File

@@ -0,0 +1,200 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::ShortBoxSlice;
use super::ShortBoxSliceInner;
#[cfg(feature = "alloc")]
use super::ShortBoxSliceIntoIter;
use litemap::store::*;
impl<K, V> StoreConstEmpty<K, V> for ShortBoxSlice<(K, V)> {
    /// The empty store, available in const contexts.
    const EMPTY: Self = Self::new();
}
impl<K, V> StoreSlice<K, V> for ShortBoxSlice<(K, V)> {
    type Slice = [(K, V)];

    /// Returns the sub-slice for `range`, or `None` if the range is out of bounds.
    #[inline]
    fn lm_get_range(&self, range: core::ops::Range<usize>) -> Option<&Self::Slice> {
        let entries: &[(K, V)] = self;
        entries.get(range)
    }
}
impl<K, V> Store<K, V> for ShortBoxSlice<(K, V)> {
#[inline]
fn lm_len(&self) -> usize {
self.len()
}
#[inline]
fn lm_is_empty(&self) -> bool {
use ShortBoxSliceInner::*;
matches!(self.0, ZeroOne(None))
}
#[inline]
fn lm_get(&self, index: usize) -> Option<(&K, &V)> {
self.get(index).map(|elt| (&elt.0, &elt.1))
}
#[inline]
fn lm_last(&self) -> Option<(&K, &V)> {
use ShortBoxSliceInner::*;
match self.0 {
ZeroOne(ref v) => v.as_ref(),
#[cfg(feature = "alloc")]
Multi(ref v) => v.last(),
#[cfg(not(feature = "alloc"))]
Two([_, ref v]) => Some(v),
}
.map(|elt| (&elt.0, &elt.1))
}
#[inline]
fn lm_binary_search_by<F>(&self, mut cmp: F) -> Result<usize, usize>
where
F: FnMut(&K) -> core::cmp::Ordering,
{
self.binary_search_by(|(k, _)| cmp(k))
}
}
#[cfg(feature = "alloc")]
impl<K: Ord, V> StoreFromIterable<K, V> for ShortBoxSlice<(K, V)> {
    /// Builds the store by delegating to the `Vec` implementation and converting the result.
    fn lm_sort_from_iter<I: IntoIterator<Item = (K, V)>>(iter: I) -> Self {
        let sorted: alloc::vec::Vec<(K, V)> = alloc::vec::Vec::lm_sort_from_iter(iter);
        Self::from(sorted)
    }
}
#[cfg(feature = "alloc")]
impl<K, V> StoreMut<K, V> for ShortBoxSlice<(K, V)> {
    /// The capacity hint is ignored; an empty store is returned.
    fn lm_with_capacity(_capacity: usize) -> Self {
        Self::new()
    }

    /// No-op: this store does not reserve capacity.
    fn lm_reserve(&mut self, _additional: usize) {}

    /// Borrows the key immutably and the value mutably at `index`, if present.
    fn lm_get_mut(&mut self, index: usize) -> Option<(&K, &mut V)> {
        self.get_mut(index).map(|(key, value)| (&*key, value))
    }

    /// Appends a pair at the end of the store.
    fn lm_push(&mut self, key: K, value: V) {
        let entry = (key, value);
        self.push(entry)
    }

    /// Inserts a pair at `index`.
    fn lm_insert(&mut self, index: usize, key: K, value: V) {
        let entry = (key, value);
        self.insert(index, entry)
    }

    /// Removes and returns the pair at `index`.
    fn lm_remove(&mut self, index: usize) -> (K, V) {
        self.remove(index)
    }

    /// Removes every pair from the store.
    fn lm_clear(&mut self) {
        self.clear();
    }
}
#[cfg(feature = "alloc")]
impl<K: Ord, V> StoreBulkMut<K, V> for ShortBoxSlice<(K, V)> {
    /// Keeps only the pairs for which `predicate` returns `true`.
    fn lm_retain<F>(&mut self, mut predicate: F)
    where
        F: FnMut(&K, &V) -> bool,
    {
        self.retain(|(k, v)| predicate(k, v))
    }

    /// Extends the store with the pairs yielded by `other`.
    ///
    /// Sorting and deduplication are delegated to the `Vec` implementation of
    /// `lm_extend`. The result is always stored in canonical form: zero or one pair
    /// uses `ZeroOne`, two or more use `Multi`.
    fn lm_extend<I>(&mut self, other: I)
    where
        I: IntoIterator<Item = (K, V)>,
    {
        let mut other = other.into_iter();
        // Use an Option to hold the first item of the map and move it to
        // items if there are more items. Meaning that if items is not
        // empty, first is None.
        let mut first = None;
        let mut items = alloc::vec::Vec::new();
        match core::mem::take(&mut self.0) {
            ShortBoxSliceInner::ZeroOne(zo) => {
                first = zo;
                // Attempt to avoid the items allocation by advancing the iterator
                // up to two times. If we eventually find a second item, we can
                // lm_extend the Vec with the first, next (second) and the rest
                // of the iterator.
                while let Some(next) = other.next() {
                    if let Some(first) = first.take() {
                        // lm_extend will take care of sorting and deduplicating
                        // first, next and the rest of the other iterator.
                        items.lm_extend([first, next].into_iter().chain(other));
                        break;
                    }
                    first = Some(next);
                }
            }
            ShortBoxSliceInner::Multi(existing_items) => {
                items.reserve_exact(existing_items.len() + other.size_hint().0);
                // We use a plain extend with existing items, which are already valid and
                // lm_extend will fold over rest of the iterator sorting and deduplicating as needed.
                items.extend(existing_items);
                items.lm_extend(other);
            }
        }
        self.0 = match items.len() {
            // Nothing was spilled into the Vec: at most one pair, held in `first`.
            0 => ShortBoxSliceInner::ZeroOne(first),
            // Deduplication can collapse the spilled pairs down to a single one (e.g. a
            // one-entry store extended with the same key). Store it inline so that the
            // derived equality/hash and `single()` see the canonical representation.
            1 => {
                debug_assert!(first.is_none());
                ShortBoxSliceInner::ZeroOne(items.pop())
            }
            _ => {
                debug_assert!(first.is_none());
                ShortBoxSliceInner::Multi(items.into_boxed_slice())
            }
        };
    }
}
impl<'a, K: 'a, V: 'a> StoreIterable<'a, K, V> for ShortBoxSlice<(K, V)> {
// The mapping function is named as a plain `fn` pointer so the iterator type can be
// written out; the non-capturing closure in `lm_iter` is used at that type.
type KeyValueIter =
core::iter::Map<core::slice::Iter<'a, (K, V)>, for<'r> fn(&'r (K, V)) -> (&'r K, &'r V)>;
/// Iterates over the pairs in slice order, yielding `(&K, &V)`.
fn lm_iter(&'a self) -> Self::KeyValueIter {
self.iter().map(|elt| (&elt.0, &elt.1))
}
}
// Empty impl: no items are overridden here, so the trait's own defaults (if any) apply.
#[cfg(feature = "alloc")]
impl<K, V> StoreFromIterator<K, V> for ShortBoxSlice<(K, V)> {}
#[cfg(feature = "alloc")]
impl<'a, K: 'a, V: 'a> StoreIterableMut<'a, K, V> for ShortBoxSlice<(K, V)> {
// As with the shared iterator, the mapping function is named as a plain `fn` pointer
// so that the iterator type can be written out.
type KeyValueIterMut = core::iter::Map<
core::slice::IterMut<'a, (K, V)>,
for<'r> fn(&'r mut (K, V)) -> (&'r K, &'r mut V),
>;
/// Iterates over the pairs in slice order, yielding `(&K, &mut V)`: keys stay
/// immutable, values can be modified.
fn lm_iter_mut(
&'a mut self,
) -> <Self as litemap::store::StoreIterableMut<'a, K, V>>::KeyValueIterMut {
self.iter_mut().map(|elt| (&elt.0, &mut elt.1))
}
}
#[cfg(feature = "alloc")]
impl<K, V> StoreIntoIterator<K, V> for ShortBoxSlice<(K, V)> {
    type KeyValueIntoIter = ShortBoxSliceIntoIter<(K, V)>;

    /// Consumes the store and yields its pairs by value.
    fn lm_into_iter(self) -> Self::KeyValueIntoIter {
        IntoIterator::into_iter(self)
    }
    // leave lm_extend_end as default
    // leave lm_extend_start as default
}
// Runs litemap's `check_store` testing helper against `ShortBoxSlice`.
#[test]
fn test_short_slice_impl() {
litemap::testing::check_store::<ShortBoxSlice<(u32, u64)>>();
}
// Runs litemap's `check_store_full` testing helper against `ShortBoxSlice`.
#[test]
fn test_short_slice_impl_full() {
litemap::testing::check_store_full::<ShortBoxSlice<(u32, u64)>>();
}

View File

@@ -0,0 +1,404 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module includes variable-length data types that are const-constructible for single
//! values and overflow to the heap.
//!
//! # Why?
//!
//! This module is far from the first stack-or-heap vector in the Rust ecosystem. It was created
//! with the following value proposition:
//!
//! 1. Enable safe const construction of stack collections.
//! 2. Avoid stack size penalties common with stack-or-heap collections.
//!
//! As of this writing, `heapless` and `tinyvec` don't support const construction except
//! for empty vectors, and `smallvec` supports it on unstable.
//!
//! Additionally, [`ShortBoxSlice`] has a smaller stack size than any of these:
//!
//! ```ignore
//! use core::mem::size_of;
//!
//! // NonZeroU64 has a niche that this module utilizes
//! use core::num::NonZeroU64;
//!
//! // ShortBoxSlice is the same size as `Box<[]>` for small or nichey values
//! assert_eq!(16, size_of::<shortvec::ShortBoxSlice::<NonZeroU64>>());
//!
//! // Note: SmallVec supports pushing and therefore has a capacity field
//! assert_eq!(24, size_of::<smallvec::SmallVec::<[NonZeroU64; 1]>>());
//!
//! // Note: heapless doesn't support spilling to the heap
//! assert_eq!(16, size_of::<heapless::Vec::<NonZeroU64, 1>>());
//!
//! // Note: TinyVec only supports types that implement `Default`
//! assert_eq!(24, size_of::<tinyvec::TinyVec::<[u64; 1]>>());
//! ```
//!
//! The module is `no_std` with `alloc`.
mod litemap;
#[cfg(feature = "alloc")]
use alloc::boxed::Box;
#[cfg(feature = "alloc")]
use alloc::vec;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
use core::ops::DerefMut;
/// Backing storage for [`ShortBoxSlice`].
///
/// Zero or one element is stored inline in `ZeroOne`. Longer collections use a boxed
/// slice when the `alloc` feature is enabled, and a fixed two-element array otherwise.
/// Using ZeroOne(Option<T>) saves 8 bytes in ShortBoxSlice via niche optimization.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum ShortBoxSliceInner<T> {
    ZeroOne(Option<T>),
    #[cfg(feature = "alloc")]
    Multi(Box<[T]>),
    #[cfg(not(feature = "alloc"))]
    Two([T; 2]),
}

impl<T> Default for ShortBoxSliceInner<T> {
    /// The default storage is empty.
    fn default() -> Self {
        use ShortBoxSliceInner::*;
        ZeroOne(None)
    }
}

/// A boxed slice that supports no-allocation, constant values if length 0 or 1.
///
/// Supports mutation but always reallocs when mutated.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct ShortBoxSlice<T>(ShortBoxSliceInner<T>);

impl<T> Default for ShortBoxSlice<T> {
    /// The default collection is empty.
    fn default() -> Self {
        Self(Default::default())
    }
}

impl<T> ShortBoxSlice<T> {
    /// Creates a new, empty [`ShortBoxSlice`].
    #[inline]
    pub const fn new() -> Self {
        use ShortBoxSliceInner::*;
        Self(ZeroOne(None))
    }

    /// Creates a new [`ShortBoxSlice`] containing a single element.
    #[inline]
    pub const fn new_single(item: T) -> Self {
        use ShortBoxSliceInner::*;
        Self(ZeroOne(Some(item)))
    }

    /// Creates a new [`ShortBoxSlice`] containing exactly two elements, in order.
    ///
    /// Allocates when the `alloc` feature is enabled; otherwise the pair is stored inline.
    pub fn new_double(first: T, second: T) -> Self {
        use ShortBoxSliceInner::*;
        #[cfg(feature = "alloc")]
        return Self(Multi(vec![first, second].into_boxed_slice()));
        #[cfg(not(feature = "alloc"))]
        return Self(Two([first, second]));
    }

    /// Pushes an element onto this [`ShortBoxSlice`].
    ///
    /// Reallocs if more than 1 item is already in the collection.
    #[cfg(feature = "alloc")]
    pub fn push(&mut self, item: T) {
        use ShortBoxSliceInner::*;
        self.0 = match core::mem::replace(&mut self.0, ZeroOne(None)) {
            ZeroOne(None) => ZeroOne(Some(item)),
            ZeroOne(Some(prev_item)) => Multi(vec![prev_item, item].into_boxed_slice()),
            Multi(items) => {
                let mut items = items.into_vec();
                items.push(item);
                Multi(items.into_boxed_slice())
            }
        };
    }

    /// Gets a single element from the [`ShortBoxSlice`].
    ///
    /// Returns `None` if empty or more than one element.
    #[inline]
    pub const fn single(&self) -> Option<&T> {
        use ShortBoxSliceInner::*;
        match self.0 {
            ZeroOne(Some(ref v)) => Some(v),
            _ => None,
        }
    }

    /// Destruct into a single element of the [`ShortBoxSlice`].
    ///
    /// Returns `None` if empty or more than one element.
    pub fn into_single(self) -> Option<T> {
        use ShortBoxSliceInner::*;
        match self.0 {
            ZeroOne(Some(v)) => Some(v),
            _ => None,
        }
    }

    /// Returns the number of elements in the collection.
    #[inline]
    pub fn len(&self) -> usize {
        use ShortBoxSliceInner::*;
        match self.0 {
            ZeroOne(None) => 0,
            ZeroOne(_) => 1,
            #[cfg(feature = "alloc")]
            Multi(ref v) => v.len(),
            #[cfg(not(feature = "alloc"))]
            Two(_) => 2,
        }
    }

    /// Returns whether the collection is empty.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        use ShortBoxSliceInner::*;
        matches!(self.0, ZeroOne(None))
    }

    /// Inserts an element at the specified index into the collection.
    ///
    /// Reallocs if more than 1 item is already in the collection.
    ///
    /// # Panics
    ///
    /// Panics if `index > len`.
    #[cfg(feature = "alloc")]
    pub fn insert(&mut self, index: usize, elt: T) {
        use ShortBoxSliceInner::*;
        assert!(
            index <= self.len(),
            "insertion index (is {}) should be <= len (is {})",
            index,
            self.len()
        );
        self.0 = match core::mem::replace(&mut self.0, ZeroOne(None)) {
            ZeroOne(None) => ZeroOne(Some(elt)),
            ZeroOne(Some(item)) => {
                let items = if index == 0 {
                    vec![elt, item].into_boxed_slice()
                } else {
                    vec![item, elt].into_boxed_slice()
                };
                Multi(items)
            }
            Multi(items) => {
                let mut items = items.into_vec();
                items.insert(index, elt);
                Multi(items.into_boxed_slice())
            }
        }
    }

    /// Removes the element at the specified index from the collection and returns it.
    ///
    /// Reallocs if more than 2 items are in the collection.
    ///
    /// # Panics
    ///
    /// Panics if `index >= len`.
    pub fn remove(&mut self, index: usize) -> T {
        use ShortBoxSliceInner::*;
        assert!(
            index < self.len(),
            "removal index (is {}) should be < len (is {})",
            index,
            self.len()
        );
        let (replaced, removed_item) = match core::mem::replace(&mut self.0, ZeroOne(None)) {
            // Excluded by the assertion above: an empty collection has no valid index.
            ZeroOne(None) => unreachable!(),
            ZeroOne(Some(v)) => (ZeroOne(None), v),
            #[cfg(feature = "alloc")]
            Multi(v) => {
                let mut v = v.into_vec();
                let removed_item = v.remove(index);
                match v.len() {
                    // At most one element is left: store it (or nothing) inline.
                    0 | 1 => (ZeroOne(v.pop()), removed_item),
                    // v has at least 2 elements, create a Multi variant
                    _ => (Multi(v.into_boxed_slice()), removed_item),
                }
            }
            // The assertion guarantees `index` is 0 or 1 here; honour it so that the
            // requested element is the one removed and the other one is kept.
            #[cfg(not(feature = "alloc"))]
            Two([f, s]) => {
                if index == 0 {
                    (ZeroOne(Some(s)), f)
                } else {
                    (ZeroOne(Some(f)), s)
                }
            }
        };
        self.0 = replaced;
        removed_item
    }

    /// Removes all elements from the collection.
    #[inline]
    pub fn clear(&mut self) {
        use ShortBoxSliceInner::*;
        self.0 = ZeroOne(None);
    }

    /// Retains only the elements specified by the predicate.
    #[allow(dead_code)]
    pub fn retain<F>(&mut self, mut f: F)
    where
        F: FnMut(&T) -> bool,
    {
        use ShortBoxSliceInner::*;
        // `take` leaves `self` empty; every arm below writes the surviving elements back.
        match core::mem::take(&mut self.0) {
            ZeroOne(Some(one)) if f(&one) => self.0 = ZeroOne(Some(one)),
            ZeroOne(_) => self.0 = ZeroOne(None),
            #[cfg(feature = "alloc")]
            Multi(slice) => {
                let mut vec = slice.into_vec();
                vec.retain(f);
                *self = ShortBoxSlice::from(vec)
            }
            #[cfg(not(feature = "alloc"))]
            Two([first, second]) => {
                *self = match (Some(first).filter(&mut f), Some(second).filter(&mut f)) {
                    (None, None) => ShortBoxSlice::new(),
                    (None, Some(x)) | (Some(x), None) => ShortBoxSlice::new_single(x),
                    (Some(f), Some(s)) => ShortBoxSlice::new_double(f, s),
                }
            }
        };
    }
}
impl<T> Deref for ShortBoxSlice<T> {
    type Target = [T];

    /// Views the collection as a slice, whatever the underlying storage is.
    fn deref(&self) -> &Self::Target {
        use ShortBoxSliceInner::*;
        match &self.0 {
            ZeroOne(None) => &[],
            ZeroOne(Some(only)) => core::slice::from_ref(only),
            #[cfg(feature = "alloc")]
            Multi(items) => items,
            #[cfg(not(feature = "alloc"))]
            Two(pair) => pair,
        }
    }
}
impl<T> DerefMut for ShortBoxSlice<T> {
    /// Views the collection as a mutable slice, whatever the underlying storage is.
    fn deref_mut(&mut self) -> &mut Self::Target {
        use ShortBoxSliceInner::*;
        match &mut self.0 {
            ZeroOne(None) => &mut [],
            ZeroOne(Some(only)) => core::slice::from_mut(only),
            #[cfg(feature = "alloc")]
            Multi(items) => items,
            #[cfg(not(feature = "alloc"))]
            Two(pair) => pair,
        }
    }
}
#[cfg(feature = "alloc")]
impl<T> From<Vec<T>> for ShortBoxSlice<T> {
    /// Converts a `Vec`, storing zero or one element inline and anything longer as a
    /// boxed slice.
    fn from(v: Vec<T>) -> Self {
        use ShortBoxSliceInner::*;
        if v.len() > 1 {
            Self(Multi(v.into_boxed_slice()))
        } else {
            // Zero or one element: the first item of the iterator (if any) is the whole content.
            Self(ZeroOne(v.into_iter().next()))
        }
    }
}
#[cfg(feature = "alloc")]
impl<T> FromIterator<T> for ShortBoxSlice<T> {
    /// Collects an iterator, only allocating once a second element has been seen.
    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
        use ShortBoxSliceInner::*;
        let mut rest = iter.into_iter();
        // Pull the first two items up front; both calls always happen.
        let head = rest.next();
        let second = rest.next();
        match (head, second) {
            (Some(head), Some(second)) => {
                // Size hint behaviour same as `Vec::extend` + 2
                let capacity = rest.size_hint().0.saturating_add(3);
                let mut collected = Vec::with_capacity(capacity);
                collected.push(head);
                collected.push(second);
                collected.extend(rest);
                Self(Multi(collected.into_boxed_slice()))
            }
            (head, _) => Self(ZeroOne(head)),
        }
    }
}
/// An owning iterator over the elements of a [`ShortBoxSlice`].
#[derive(Debug)]
pub struct ShortBoxSliceIntoIter<T>(ShortBoxSliceIntoIterInner<T>);

/// Iterator state for [`ShortBoxSliceIntoIter`], with one variant per storage kind.
#[derive(Debug)]
pub(crate) enum ShortBoxSliceIntoIterInner<T> {
    ZeroOne(Option<T>),
    #[cfg(feature = "alloc")]
    Multi(alloc::vec::IntoIter<T>),
    #[cfg(not(feature = "alloc"))]
    Two(core::array::IntoIter<T, 2>),
}

impl<T> Iterator for ShortBoxSliceIntoIter<T> {
    type Item = T;

    /// Yields the next element, delegating to the underlying iterator where there is one.
    fn next(&mut self) -> Option<T> {
        use ShortBoxSliceIntoIterInner::*;
        match self.0 {
            // Taking the value leaves `None` behind, so the element is yielded once.
            ZeroOne(ref mut slot) => slot.take(),
            #[cfg(feature = "alloc")]
            Multi(ref mut remaining) => remaining.next(),
            #[cfg(not(feature = "alloc"))]
            Two(ref mut remaining) => remaining.next(),
        }
    }
}
impl<T> IntoIterator for ShortBoxSlice<T> {
type Item = T;
type IntoIter = ShortBoxSliceIntoIter<T>;
fn into_iter(self) -> Self::IntoIter {
match self.0 {
ShortBoxSliceInner::ZeroOne(option) => {
ShortBoxSliceIntoIter(ShortBoxSliceIntoIterInner::ZeroOne(option))
}
// TODO: Use a boxed slice IntoIter impl when available:
// <https://github.com/rust-lang/rust/issues/59878>
#[cfg(feature = "alloc")]
ShortBoxSliceInner::Multi(boxed_slice) => ShortBoxSliceIntoIter(
ShortBoxSliceIntoIterInner::Multi(boxed_slice.into_vec().into_iter()),
),
#[cfg(not(feature = "alloc"))]
ShortBoxSliceInner::Two(arr) => {
ShortBoxSliceIntoIter(ShortBoxSliceIntoIterInner::Two(arr.into_iter()))
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
// A single-element `ShortBoxSlice` can be built in a `const` item.
#[test]
#[expect(clippy::get_first)]
fn test_new_single_const() {
const MY_CONST_SLICE: ShortBoxSlice<i32> = ShortBoxSlice::new_single(42);
assert_eq!(MY_CONST_SLICE.len(), 1);
assert_eq!(MY_CONST_SLICE.get(0), Some(&42));
}
// `single()` is `Some` only while the collection holds exactly one element.
#[test]
#[expect(clippy::redundant_pattern_matching)]
fn test_get_single() {
let mut vec = ShortBoxSlice::new();
assert!(matches!(vec.single(), None));
vec.push(100);
assert!(matches!(vec.single(), Some(_)));
vec.push(200);
assert!(matches!(vec.single(), None));
}
}

View File

@@ -0,0 +1,59 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A language subtag (examples: `"en"`, `"csb"`, `"zh"`, `"und"`, etc.)
///
/// [`Language`] represents a Unicode base language code conformant to the
/// [`unicode_language_id`] field of the Language and Locale Identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Language;
///
/// let language: Language =
/// "en".parse().expect("Failed to parse a language subtag.");
/// ```
///
/// If the [`Language`] has no value assigned, it serializes to the string `"und"`, which
/// can then be parsed back to an empty [`Language`] field.
///
/// ```
/// use icu::locale::subtags::Language;
///
/// assert_eq!(Language::UNKNOWN.as_str(), "und");
/// ```
///
/// Note: ICU4X uses a narrow form of the language subtag of 2-3 characters.
/// The specification allows the language subtag to optionally also be 5-8 characters,
/// but that form has not been used and ICU4X does not support it right now.
///
/// [`unicode_language_id`]: https://unicode.org/reports/tr35/#unicode_language_id
Language,
subtags,
language,
subtags_language,
2..=3,
s,
s.is_ascii_alphabetic(),
s.to_ascii_lowercase(),
s.is_ascii_alphabetic_lowercase(),
InvalidLanguage,
["en", "foo"],
["419", "german", "en1"],
);
impl Language {
/// The unknown language "und".
pub const UNKNOWN: Self = language!("und");
/// Whether this [`Language`] equals [`Language::UNKNOWN`].
///
/// Usable in `const` contexts.
#[inline]
pub const fn is_unknown(self) -> bool {
matches!(self, Self::UNKNOWN)
}
}

View File

@@ -0,0 +1,163 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Language Identifiers and Locales contain a set of subtags
//! which represent different fields of the structure.
//!
//! * [`Language`] is the only mandatory field, which when empty,
//! takes the value `und`.
//! * [`Script`] is an optional field representing the written script used by the locale.
//! * [`Region`] is the region used by the locale.
//! * [`Variants`] is a list of optional [`Variant`] subtags containing information about the
//! variant adjustments used by the locale.
//!
//! Subtags can be used in isolation, and all basic operations such as parsing, syntax normalization
//! and serialization are supported on each individual subtag, but most commonly
//! they are used to construct a [`LanguageIdentifier`] instance.
//!
//! [`Variants`] is a special structure which contains a list of [`Variant`] subtags.
//! It is wrapped around to allow for sorting and deduplication of variants, which
//! is one of the required steps of language identifier and locale syntax normalization.
//!
//! # Examples
//!
//! ```
//! use icu::locale::subtags::{Language, Region, Script, Variant};
//!
//! let language: Language =
//! "en".parse().expect("Failed to parse a language subtag.");
//! let script: Script =
//! "arab".parse().expect("Failed to parse a script subtag.");
//! let region: Region =
//! "cn".parse().expect("Failed to parse a region subtag.");
//! let variant: Variant =
//! "MacOS".parse().expect("Failed to parse a variant subtag.");
//!
//! assert_eq!(language.as_str(), "en");
//! assert_eq!(script.as_str(), "Arab");
//! assert_eq!(region.as_str(), "CN");
//! assert_eq!(variant.as_str(), "macos");
//! ```
//!
//! `Notice`: The subtags are normalized on parsing. That means
//! that all operations work on a normalized version of the subtag
//! and serialization is very cheap.
//!
//! [`LanguageIdentifier`]: super::LanguageIdentifier
mod language;
mod region;
mod script;
mod variant;
mod variants;
#[doc(inline)]
pub use language::{language, Language};
#[doc(inline)]
pub use region::{region, Region};
#[doc(inline)]
pub use script::{script, Script};
#[doc(inline)]
pub use variant::{variant, Variant};
pub use variants::Variants;
impl_tinystr_subtag!(
/// A generic subtag.
///
/// The subtag has to be an ASCII alphanumeric string no shorter than
/// two characters and no longer than eight.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Subtag;
///
/// let subtag1: Subtag = "Foo".parse()
/// .expect("Failed to parse a Subtag.");
///
/// assert_eq!(subtag1.as_str(), "foo");
/// ```
Subtag,
subtags,
subtag,
subtags_subtag,
2..=8,
s,
s.is_ascii_alphanumeric(),
s.to_ascii_lowercase(),
s.is_ascii_alphanumeric() && s.is_ascii_lowercase(),
InvalidSubtag,
["foo12"],
["f", "toolooong"],
);
#[expect(clippy::len_without_is_empty)]
impl Subtag {
#[allow(dead_code)]
pub(crate) const fn valid_key(v: &[u8]) -> bool {
2 <= v.len() && v.len() <= 8
}
/// Returns the length of `self`.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::subtag;
/// let s = subtag!("foo");
/// assert_eq!(s.len(), 3);
/// ```
pub fn len(&self) -> usize {
self.0.len()
}
#[doc(hidden)]
pub fn from_tinystr_unvalidated(input: tinystr::TinyAsciiStr<8>) -> Self {
Self(input)
}
#[doc(hidden)]
pub fn as_tinystr(&self) -> tinystr::TinyAsciiStr<8> {
self.0
}
#[allow(dead_code)]
pub(crate) fn to_ascii_lowercase(self) -> Self {
Self(self.0.to_ascii_lowercase())
}
}
impl<const N: usize> TryFrom<tinystr::TinyAsciiStr<N>> for Subtag {
    type Error = crate::parser::errors::ParseError;

    /// Validates the string content of `value` with `try_from_str`.
    fn try_from(value: tinystr::TinyAsciiStr<N>) -> Result<Self, Self::Error> {
        let text: &str = &value;
        Self::try_from_str(text)
    }
}
impl PartialEq<str> for Subtag {
/// Compares the stored subtag with `other` by delegating to the inner string's
/// own comparison with `str`.
fn eq(&self, other: &str) -> bool {
self.0 == other
}
}
#[cfg(test)]
mod tests {
use super::*;
use tinystr::tinystr;
// The `subtag!` macro produces a subtag whose string form matches the input.
#[test]
fn test_subtag() {
let subtag = subtag!("foo");
assert_eq!(subtag.as_str(), "foo");
}
// Conversion from a `TinyAsciiStr` succeeds for a 3-character string and fails for
// a 1-character one (below the 2-character minimum).
#[test]
fn test_subtag_from_tinystr() {
let subtag = Subtag::try_from(tinystr!(3, "foo"));
assert!(subtag.is_ok());
let subtag = Subtag::try_from(tinystr!(1, "f"));
assert!(subtag.is_err());
}
}

View File

@@ -0,0 +1,60 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A region subtag (examples: `"US"`, `"CN"`, `"AR"` etc.)
///
/// [`Region`] represents a Unicode region code conformant to the
/// [`unicode_region_id`] field of the Language and Locale Identifier.
///
/// As the validation below shows, a region is either two ASCII letters
/// (normalized to uppercase) or three ASCII digits.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Region;
///
/// let region: Region =
/// "DE".parse().expect("Failed to parse a region subtag.");
/// ```
///
/// [`unicode_region_id`]: https://unicode.org/reports/tr35/#unicode_region_id
Region,
subtags,
region,
subtags_region,
2..=3,
s,
if s.len() == 2 {
s.is_ascii_alphabetic()
} else {
s.is_ascii_numeric()
},
if s.len() == 2 {
s.to_ascii_uppercase()
} else {
s
},
if s.len() == 2 {
s.is_ascii_alphabetic_uppercase()
} else {
s.is_ascii_numeric()
},
InvalidSubtag,
["FR", "123"],
["12", "FRA", "b2"],
);
impl Region {
/// Returns true if the Region has an alphabetic code.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::region;
///
/// assert!(region!("us").is_alphabetic());
/// ```
pub fn is_alphabetic(&self) -> bool {
self.0.len() == 2
}
}

View File

@@ -0,0 +1,41 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::subtags::Subtag;
impl_tinystr_subtag!(
/// A script subtag (examples: `"Latn"`, `"Arab"`, etc.)
///
/// [`Script`] represents a Unicode script code conformant to the
/// [`unicode_script_id`] field of the Language and Locale Identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Script;
///
/// let script: Script =
/// "Latn".parse().expect("Failed to parse a script subtag.");
/// ```
///
/// [`unicode_script_id`]: https://unicode.org/reports/tr35/#unicode_script_id
Script,
subtags,
script,
subtags_script,
// NOTE(review): presumably the accepted length range of the subtag (exactly
// four here); confirm against the `impl_tinystr_subtag!` definition.
4..=4,
s,
// NOTE(review): the next three expressions look like, in order, the validity
// check on the input (all ASCII letters), the conversion to canonical form
// (titlecase), and the check that a value is already canonical; confirm
// against the macro definition.
s.is_ascii_alphabetic(),
s.to_ascii_titlecase(),
s.is_ascii_alphabetic_titlecase(),
InvalidSubtag,
// NOTE(review): these look like sample valid inputs followed by sample invalid
// inputs; confirm how the macro uses them.
["Latn"],
["Latin"],
);
impl From<Script> for Subtag {
fn from(value: Script) -> Self {
Subtag(value.0.resize())
}
}

View File

@@ -0,0 +1,35 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
impl_tinystr_subtag!(
/// A variant subtag (examples: `"macos"`, `"posix"`, `"1996"` etc.)
///
/// [`Variant`] represents a Unicode variant code conformant to the
/// [`unicode_variant_id`] field of the Language and Locale Identifier.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Variant;
///
/// let variant: Variant =
/// "macos".parse().expect("Failed to parse a variant subtag.");
/// ```
///
/// [`unicode_variant_id`]: https://unicode.org/reports/tr35/#unicode_variant_id
Variant,
subtags,
variant,
subtags_variant,
// NOTE(review): presumably the accepted length range of the subtag; confirm
// against the `impl_tinystr_subtag!` definition.
4..=8,
s,
// NOTE(review): the next three expressions look like, in order, the validity
// check on the input, the conversion to canonical form, and the check that a
// value is already canonical; confirm against the macro definition.
//
// All characters must be ASCII alphanumeric, and a four-character value must
// additionally start with an ASCII digit.
s.is_ascii_alphanumeric() && (s.len() != 4 || s.all_bytes()[0].is_ascii_digit()),
s.to_ascii_lowercase(),
// Same rule as above, plus the requirement that the value is lowercase.
s.is_ascii_lowercase()
&& s.is_ascii_alphanumeric()
&& (s.len() != 4 || s.all_bytes()[0].is_ascii_digit()),
InvalidSubtag,
// NOTE(review): these look like sample valid inputs followed by sample invalid
// inputs; confirm how the macro uses them.
["posix", "1996"],
["yes"],
);

View File

@@ -0,0 +1,138 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::Variant;
use crate::shortvec::ShortBoxSlice;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use core::ops::Deref;
/// A collection of [`Variant`] subtags (for example `["macos", "posix"]`).
///
/// The canonical form of [`Variants`] is a sorted list without duplicates.
/// The example below sorts and deduplicates the vector before building the
/// list from it.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::{variant, Variants};
///
/// let mut v = vec![variant!("posix"), variant!("macos")];
/// v.sort();
/// v.dedup();
///
/// let variants: Variants = Variants::from_vec_unchecked(v);
/// assert_eq!(variants.to_string(), "macos-posix");
/// ```
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Variants(ShortBoxSlice<Variant>);
impl Variants {
/// Returns a new empty list of variants. Same as [`default()`](Default::default()), but is `const`.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::Variants;
///
/// assert_eq!(Variants::new(), Variants::default());
/// ```
#[inline]
pub const fn new() -> Self {
Self(ShortBoxSlice::new())
}
/// Creates a new [`Variants`] set from a single [`Variant`].
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::{variant, Variants};
///
/// let variants = Variants::from_variant(variant!("posix"));
/// ```
#[inline]
pub const fn from_variant(variant: Variant) -> Self {
Self(ShortBoxSlice::new_single(variant))
}
/// Creates a new [`Variants`] set from a [`Vec`].
/// The caller is expected to provide sorted and deduplicated vector as
/// an input.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::{variant, Variants};
///
/// let mut v = vec![variant!("posix"), variant!("macos")];
/// v.sort();
/// v.dedup();
///
/// let variants = Variants::from_vec_unchecked(v);
/// ```
///
/// Notice: For performance- and memory-constrained environments, it is recommended
/// for the caller to use [`binary_search`](slice::binary_search) instead of [`sort`](slice::sort)
/// and [`dedup`](Vec::dedup()).
#[cfg(feature = "alloc")]
pub fn from_vec_unchecked(input: Vec<Variant>) -> Self {
Self(input.into())
}
#[cfg(feature = "alloc")]
pub(crate) fn from_short_slice_unchecked(input: ShortBoxSlice<Variant>) -> Self {
Self(input)
}
/// Empties the [`Variants`] list.
///
/// Returns the old list.
///
/// # Examples
///
/// ```
/// use icu::locale::subtags::{variant, Variants};
///
/// let mut v = vec![variant!("posix"), variant!("macos")];
/// v.sort();
/// v.dedup();
///
/// let mut variants: Variants = Variants::from_vec_unchecked(v);
///
/// assert_eq!(variants.to_string(), "macos-posix");
///
/// variants.clear();
///
/// assert_eq!(variants, Variants::default());
/// ```
pub fn clear(&mut self) -> Self {
core::mem::take(self)
}
/// Whether the list of variants is empty.
pub const fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
where
F: FnMut(&str) -> Result<(), E>,
{
self.deref().iter().map(|t| t.as_str()).try_for_each(f)
}
}
// NOTE(review): presumably generates the string-formatting impls for `Variants`
// (its doc examples call `to_string()`); the two string arguments look like
// sample subtags. Confirm against the `impl_writeable_for_subtag_list!` definition.
impl_writeable_for_subtag_list!(Variants, "macos", "posix");
impl Deref for Variants {
type Target = [Variant];
fn deref(&self) -> &[Variant] {
self.0.deref()
}
}

131
vendor/icu_locale_core/src/zerovec.rs vendored Normal file
View File

@@ -0,0 +1,131 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Documentation on zero-copy deserialization of locale types.
//!
//! [`Locale`] and [`LanguageIdentifier`] are highly structured types that cannot be directly
//! stored in a zero-copy data structure, such as those provided by the [`zerovec`](crate::zerovec) module.
//! This page explains how to indirectly store these types in a [`zerovec`](crate::zerovec).
//!
//! There are two main use cases, which have different solutions:
//!
//! 1. **Lookup:** You need to locate a locale in a zero-copy vector, such as when querying a map.
//! 2. **Obtain:** You have a locale stored in a zero-copy vector, and you need to obtain a proper
//! [`Locale`] or [`LanguageIdentifier`] for use elsewhere in your program.
//!
//! # Lookup
//!
//! To perform lookup, store the stringified locale in a canonical BCP-47 form as a byte array,
//! and then use [`Locale::strict_cmp()`] to perform an efficient, zero-allocation lookup.
//!
//! To produce more human-readable serialized output, you can use `PotentialUtf8`.
//!
//! ```
//! use icu::locale::Locale;
//! use potential_utf::PotentialUtf8;
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from locales to integers
//! let data: &[(&PotentialUtf8, u32)] = &[
//! ("de-DE-u-hc-h12".into(), 5),
//! ("en-US-u-ca-buddhist".into(), 10),
//! ("my-MM".into(), 15),
//! ("sr-Cyrl-ME".into(), 20),
//! ("zh-TW".into(), 25),
//! ];
//! let zm: ZeroMap<PotentialUtf8, u32> = data.iter().copied().collect();
//!
//! // Get the value associated with a locale
//! let loc: Locale = "en-US-u-ca-buddhist".parse().unwrap();
//! let value = zm.get_copied_by(|uvstr| loc.strict_cmp(uvstr).reverse());
//! assert_eq!(value, Some(10));
//! ```
//!
//! # Obtain
//!
//! Obtaining a [`Locale`] or [`LanguageIdentifier`] is not generally a zero-copy operation, since
//! both of these types may require memory allocation. If possible, architect your code such that
//! you do not need to obtain a structured type.
//!
//! If you need the structured type, such as if you need to manipulate it in some way, there are two
//! options: storing subtags, and storing a string for parsing.
//!
//! ## Storing Subtags
//!
//! If the data being stored only contains a limited number of subtags, you can store them as a
//! tuple, and then construct the [`LanguageIdentifier`] externally.
//!
//! ```
//! use icu::locale::subtags::{Language, Region, Script};
//! use icu::locale::LanguageIdentifier;
//! use icu::locale::{
//! langid,
//! subtags::{language, region, script},
//! };
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from integer to LSR (language-script-region)
//! let zm: ZeroMap<u32, (Language, Option<Script>, Option<Region>)> = [
//! (5, (language!("de"), None, Some(region!("DE")))),
//! (10, (language!("en"), None, Some(region!("US")))),
//! (15, (language!("my"), None, Some(region!("MM")))),
//! (
//! 20,
//! (language!("sr"), Some(script!("Cyrl")), Some(region!("ME"))),
//! ),
//! (25, (language!("zh"), None, Some(region!("TW")))),
//! ]
//! .into_iter()
//! .collect();
//!
//! // Construct a LanguageIdentifier from a tuple entry
//! let lid: LanguageIdentifier =
//! zm.get_copied(&25).expect("element is present").into();
//!
//! assert_eq!(lid, langid!("zh-TW"));
//! ```
//!
//! ## Storing Strings
//!
//! If it is necessary to store and obtain an arbitrary locale, it is currently recommended to
//! store a BCP-47 string and parse it when needed.
//!
//! Since the string is stored in an unparsed state, it is not safe to `unwrap` the result from
//! `Locale::try_from_utf8()`. See [icu4x#831](https://github.com/unicode-org/icu4x/issues/831)
//! for a discussion on potential data models that could ensure that the locale is valid during
//! deserialization.
//!
//! As above, to produce more human-readable serialized output, you can use `PotentialUtf8`.
//!
//! ```
//! use icu::locale::langid;
//! use icu::locale::Locale;
//! use potential_utf::PotentialUtf8;
//! use zerovec::ZeroMap;
//!
//! // ZeroMap from integer to locale string
//! let data: &[(u32, &PotentialUtf8)] = &[
//! (5, "de-DE-u-hc-h12".into()),
//! (10, "en-US-u-ca-buddhist".into()),
//! (15, "my-MM".into()),
//! (20, "sr-Cyrl-ME".into()),
//! (25, "zh-TW".into()),
//! (30, "INVALID".into()),
//! ];
//! let zm: ZeroMap<u32, PotentialUtf8> = data.iter().copied().collect();
//!
//! // Construct a Locale by parsing the string.
//! let value = zm.get(&25).expect("element is present");
//! let loc = Locale::try_from_utf8(value);
//! assert_eq!(loc, Ok(langid!("zh-TW").into()));
//!
//! // Parsing an invalid entry returns an error
//! let err_value = zm.get(&30).expect("element is present");
//! let err_loc = Locale::try_from_utf8(err_value);
//! assert!(err_loc.is_err());
//! ```
//!
//! [`Locale`]: crate::Locale
//! [`Locale::strict_cmp()`]: crate::Locale::strict_cmp()
//! [`LanguageIdentifier`]: crate::LanguageIdentifier