chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1 @@
{"files":{".cargo_vcs_info.json":"75992dd22cc724a459b542a67a921e80396f5849eb42d9ebf059a39654f9a9d6","Cargo.lock":"6103b6ae33368ba7383a9c85d2534b26c907012434faf758006fea61706aed6a","Cargo.toml":"24273bcc1f268b0a559f70d25e7278cae2fdbb2aa8720876e713953bfb9d568b","Cargo.toml.orig":"1c8a5ef7cda3f66eb283bae506e5750329376558c29be6fefcbe60979d3fea3b","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"aec56e279d7e40a901b47a2eccb52197fde6c9499011b349c5ef509363bee6a9","benches/bench.rs":"c863372f937f0bf3c5cf214dd3729a88d450fa67f5d054753c2d9ea31fa84286","benches/canonical_composition.rs":"0aa91d5d400f58da61865f5fabe878c8506e60466c78503f77041ef7257e6dbe","benches/canonical_decomposition.rs":"3b44b8f832e426e8c82e449743117182ab7b138288001b621ccc9325b4c27b6c","benches/composing_normalizer_nfc.rs":"9a7aaae94e0096ccac9f3d1a83585c3f449af87f9f0f8b05615d2a010078e3e8","benches/composing_normalizer_nfkc.rs":"ad92d562a1e9aad3611521526882e1896aa436d2ac59493c8c00686c57bdf31e","benches/data/README.md":"fa79b84815a228c3fbfa5d4c6d12885036994ca8ad61e683b2113cf2b428bb85","benches/data/TestNames_Japanese_h.txt":"6522f8ed794ad348c904079082ec3aa303ae7acf3f68bbc49fa0ee90eebf31e0","benches/data/TestNames_Japanese_k.txt":"e4e18804fe742ecd27ae48bc3564c6bc653180a3c649d43a2ab4d8b7f2607627","benches/data/TestNames_Korean.txt":"9cbf54d5ee16726c0fc9477366e273ba1b82e651c9e88e6c7532df5344f03920","benches/data/TestNames_Latin.txt":"3a30d450d259a6be4a6aee8eeef08d3767d11fcc047b8f58060c542efe1182d1","benches/data/TestNames_Thai.txt":"28d76ddb62d6f47646232860fce7440544f402158443889393fd7e8bf10e9c3d","benches/data/TestRandomWordsUDHR_ar.txt":"02a775153e9746ae938a9db0b60244f2c00d911bb72b611a3593b0991fd95723","benches/data/TestRandomWordsUDHR_de.txt":"100b9502e7ddcb2fcbd055cb7ec9113245105bd1c606cace5e5bc147cc18727b","benches/data/TestRandomWordsUDHR_el.txt":"d1a2f0f9efc9ce663026ca7c285177391937c90008479a8c5b909c300dc86972","benches/data/TestRandomWordsUDHR_es.txt":"deeebda09e0ce0
f80dd805317e96d1a630908601ff2a4d1ccb0021b00b55814b","benches/data/TestRandomWordsUDHR_fr.txt":"5931edc9f1af2c27a0b35c9624732e70b87b0fd72ab486710f3aa6367c7ad35f","benches/data/TestRandomWordsUDHR_he.txt":"dc77a89ffb9803e5c574d87f4789cb17624df73e40a8a92961df8ea8be103425","benches/data/TestRandomWordsUDHR_pl.txt":"26c378295ee2ef75ccacea691df0456394184a9a5c9ce48b2bada169b2402bbb","benches/data/TestRandomWordsUDHR_ru.txt":"a1c339f6d7b69cf9154e855c290ab09eeaf167ebcdf6d4bcb917de039fba10ee","benches/data/TestRandomWordsUDHR_th.txt":"3ba518be9863c85c3ac80cbb12299e3594e6f5afed3406d910d948007adaaf4e","benches/data/TestRandomWordsUDHR_tr.txt":"815c7babbc7228ef89b56f29638aeb63013aeca0003a49e58994e26b41cba01c","benches/data/wotw.txt":"8f28e68041ce75bbf75e72e186a6145e4c2de9e7e62b9b86ce0621c527a23669","benches/decomposing_normalizer_nfd.rs":"28f3d54c9af813af7ac9d0fbc9d45a7a6d27a25266bd593453eb35c1894280b5","benches/decomposing_normalizer_nfkd.rs":"cbaa2755878ee1cc90170210fddb7c79836457f89eb84f4f32fb51348f350bd5","benches/utf16_throughput.rs":"8b8065ccb2d31d7191ba4916e00c341bcc00df56fc7dac3d7a42de907a3c8d94","src/lib.rs":"46ea272d5821a752e29f6622b23b5549bf1b008775a0eb6c38bebdbe9c6191eb","src/properties.rs":"d4dcba8900c68ee08db4b00695edfbf83a7e325fb7f970bb96c40ff26afc3d80","src/provider.rs":"5850afc7ae842c7af74ce029be256944c64f5d0b51d95725a8366f5af22163e9","src/uts46.rs":"de86f63076dba6e5f95dfc99865db8ce84820190d54185b0380c737b8758247a","tests/data/NormalizationTest.txt":"1b04c22b82064adf871e76fd2148cd749129163f7d05bd7ace923516a65afe02","tests/data/README.md":"521fcd44a1f10f21629df88113fa29ca9f4e1dfbeea79fda19a7dc8ba435e24b","tests/tests.rs":"01db1c9dc1c7c71f80aed528e4309f416349af9eec887d2e438a3a11f2ee7f7c"},"package":"5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"}

View File

@@ -0,0 +1,6 @@
{
"git": {
"sha1": "38a49da495248dd1ded84cf306e4ca42e64d5bb3"
},
"path_in_vcs": "components/normalizer"
}

943
vendor/icu_normalizer/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,943 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstyle"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
[[package]]
name = "arraystring"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d517c467117e1d8ca795bc8cc90857ff7f79790cca0e26f6e9462694ece0185"
dependencies = [
"typenum",
]
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "atoi"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528"
dependencies = [
"num-traits",
]
[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "bumpalo"
version = "3.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cfg-if"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
"serde",
]
[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half",
]
[[package]]
name = "clap"
version = "4.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c"
dependencies = [
"clap_builder",
]
[[package]]
name = "clap_builder"
version = "4.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7"
dependencies = [
"anstyle",
"clap_lex",
]
[[package]]
name = "clap_lex"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "cobs"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1"
dependencies = [
"thiserror",
]
[[package]]
name = "criterion"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
dependencies = [
"anes",
"cast",
"ciborium",
"clap",
"criterion-plot",
"is-terminal",
"itertools",
"num-traits",
"once_cell",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
dependencies = [
"cast",
"itertools",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
[[package]]
name = "databake"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
dependencies = [
"databake-derive",
"proc-macro2",
"quote",
]
[[package]]
name = "databake-derive"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "detone"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5b580660e7375410c9199e84aa298f919925fb53d8cc9b02d8010ff5a14d09"
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "erased-serde"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "259d404d09818dec19332e31d94558aeb442fea04c817006456c24b5460bbd4b"
dependencies = [
"serde",
"serde_core",
"typeid",
]
[[package]]
name = "half"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
dependencies = [
"cfg-if",
"crunchy",
]
[[package]]
name = "hermit-abi"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
[[package]]
name = "icu_collections"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
dependencies = [
"databake",
"displaydoc",
"potential_utf",
"serde",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locale_core"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_normalizer"
version = "2.1.1"
dependencies = [
"arraystring",
"arrayvec",
"atoi",
"criterion",
"databake",
"detone",
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"serde",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
[[package]]
name = "icu_properties"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99"
dependencies = [
"databake",
"icu_collections",
"icu_locale_core",
"icu_properties_data",
"icu_provider",
"serde",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899"
[[package]]
name = "icu_provider"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
dependencies = [
"databake",
"displaydoc",
"erased-serde",
"icu_locale_core",
"postcard",
"serde",
"stable_deref_trait",
"writeable",
"yoke",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "is-terminal"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
dependencies = [
"hermit-abi",
"libc",
"windows-sys",
]
[[package]]
name = "itertools"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "js-sys"
version = "0.3.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.177"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976"
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
dependencies = [
"serde_core",
]
[[package]]
name = "log"
version = "0.4.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"
[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "oorandom"
version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "plotters"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
[[package]]
name = "plotters-svg"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
dependencies = [
"plotters-backend",
]
[[package]]
name = "postcard"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24"
dependencies = [
"cobs",
"serde",
]
[[package]]
name = "potential_utf"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
dependencies = [
"serde_core",
"zerovec",
]
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "regex"
version = "1.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "ryu"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
"serde_core",
]
[[package]]
name = "smallvec"
version = "1.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "syn"
version = "2.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thiserror"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tinystr"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
dependencies = [
"displaydoc",
"serde_core",
"zerovec",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "typeid"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c"
[[package]]
name = "typenum"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
[[package]]
name = "unicode-ident"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d"
dependencies = [
"cfg-if",
"once_cell",
"rustversion",
"wasm-bindgen-macro",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1"
dependencies = [
"unicode-ident",
]
[[package]]
name = "web-sys"
version = "0.3.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi-util"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [
"windows-sys",
]
[[package]]
name = "windows-link"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-sys"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
dependencies = [
"arrayvec",
"smallvec",
]
[[package]]
name = "writeable"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "yoke"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
dependencies = [
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerotrie"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde_core",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"databake",
"serde",
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

210
vendor/icu_normalizer/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,210 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.83"
name = "icu_normalizer"
version = "2.1.1"
authors = ["The ICU4X Project Developers"]
build = false
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "API for normalizing text into Unicode Normalization Forms"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[package.metadata.docs.rs]
all-features = true
[features]
compiled_data = [
"dep:icu_normalizer_data",
"icu_properties?/compiled_data",
"icu_provider/baked",
]
datagen = [
"serde",
"dep:databake",
"icu_properties",
"icu_collections/databake",
"zerovec/databake",
"icu_properties?/datagen",
"icu_provider/export",
]
default = [
"compiled_data",
"utf8_iter",
"utf16_iter",
]
experimental = []
icu_properties = ["dep:icu_properties"]
serde = [
"dep:serde",
"icu_collections/serde",
"zerovec/serde",
"icu_properties?/serde",
"icu_provider/serde",
]
utf16_iter = [
"dep:utf16_iter",
"dep:write16",
]
utf8_iter = ["dep:utf8_iter"]
write16 = []
[lib]
name = "icu_normalizer"
path = "src/lib.rs"
[[test]]
name = "tests"
path = "tests/tests.rs"
[[bench]]
name = "bench"
path = "benches/bench.rs"
harness = false
required-features = [
"utf16_iter",
"utf8_iter",
]
[[bench]]
name = "canonical_composition"
path = "benches/canonical_composition.rs"
[[bench]]
name = "canonical_decomposition"
path = "benches/canonical_decomposition.rs"
[[bench]]
name = "composing_normalizer_nfc"
path = "benches/composing_normalizer_nfc.rs"
[[bench]]
name = "composing_normalizer_nfkc"
path = "benches/composing_normalizer_nfkc.rs"
[[bench]]
name = "decomposing_normalizer_nfd"
path = "benches/decomposing_normalizer_nfd.rs"
[[bench]]
name = "decomposing_normalizer_nfkd"
path = "benches/decomposing_normalizer_nfkd.rs"
[[bench]]
name = "utf16_throughput"
path = "benches/utf16_throughput.rs"
[dependencies.databake]
version = "0.2.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.icu_collections]
version = "~2.1.1"
default-features = false
[dependencies.icu_normalizer_data]
version = "~2.1.1"
optional = true
default-features = false
[dependencies.icu_properties]
version = "~2.1.1"
optional = true
default-features = false
[dependencies.icu_provider]
version = "2.1.1"
default-features = false
[dependencies.serde]
version = "1.0.220"
features = [
"derive",
"alloc",
]
optional = true
default-features = false
[dependencies.smallvec]
version = "1.10.0"
default-features = false
[dependencies.utf16_iter]
version = "1.0.2"
optional = true
default-features = false
[dependencies.utf8_iter]
version = "1.0.2"
optional = true
default-features = false
[dependencies.write16]
version = "1.0.0"
features = ["alloc"]
optional = true
default-features = false
[dependencies.zerovec]
version = "0.11.3"
default-features = false
[dev-dependencies.arraystring]
version = "0.3.0"
[dev-dependencies.arrayvec]
version = "0.7.2"
default-features = false
[dev-dependencies.atoi]
version = "2.0.0"
[dev-dependencies.detone]
version = "1.0.0"
[dev-dependencies.write16]
version = "1.0.0"
features = [
"arrayvec",
"smallvec",
]
default-features = false
[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.criterion]
version = "0.5.0"
[lints.rust.unexpected_cfgs]
level = "warn"
priority = 0
check-cfg = ["cfg(icu4x_unstable_fast_trie_only)"]

46
vendor/icu_normalizer/LICENSE vendored Normal file
View File

@@ -0,0 +1,46 @@
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.

48
vendor/icu_normalizer/README.md vendored Normal file
View File

@@ -0,0 +1,48 @@
# icu_normalizer [![crates.io](https://img.shields.io/crates/v/icu_normalizer)](https://crates.io/crates/icu_normalizer)
<!-- cargo-rdme start -->
Normalizing text into Unicode Normalization Forms.
This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
## Functionality
The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
[`idna`](https://docs.rs/idna/latest/idna/) crate.
The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class
property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the
[`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate.
Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
non-“maybe” answer.
## Examples
```rust
let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
assert_eq!(nfc.normalize("a\u{0308}"), "ä");
assert!(nfc.is_normalized("ä"));
let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
assert_eq!(nfd.normalize("ä"), "a\u{0308}");
assert!(!nfd.is_normalized("ä"));
```
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing, etc., please visit the [ICU4X home page](https://github.com/unicode-org/icu4x).

26
vendor/icu_normalizer/benches/bench.rs vendored Normal file
View File

@@ -0,0 +1,26 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{criterion_group, criterion_main};
// Each benchmark is implemented in its own sibling module; every module exposes a
// `criterion_benchmark` function that is registered in the group below.
mod canonical_composition;
mod canonical_decomposition;
mod composing_normalizer_nfc;
mod composing_normalizer_nfkc;
mod decomposing_normalizer_nfd;
mod decomposing_normalizer_nfkd;
mod utf16_throughput;
// Collect the per-module entry points into one Criterion group named `benches`.
criterion_group!(
benches,
canonical_composition::criterion_benchmark,
canonical_decomposition::criterion_benchmark,
composing_normalizer_nfc::criterion_benchmark,
composing_normalizer_nfkc::criterion_benchmark,
decomposing_normalizer_nfd::criterion_benchmark,
decomposing_normalizer_nfkd::criterion_benchmark,
utf16_throughput::criterion_benchmark,
);
// Entry point of the benchmark binary: runs the `benches` group.
criterion_main!(benches);

View File

@@ -0,0 +1,188 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use detone::IterDecomposeVietnamese;
use icu_normalizer::properties::{
CanonicalCompositionBorrowed, CanonicalDecompositionBorrowed, Decomposed,
};
use icu_normalizer::ComposingNormalizerBorrowed;
/// One input set for the canonical-composition benchmark.
struct BenchDataContent {
/// Label of the data set; `criterion_benchmark` embeds it in the benchmark ID.
pub file_name: String,
/// Character pairs that `function_under_bench` passes to the composer.
pub pairs: Vec<(char, char)>,
}
/// Returns `content` without the lines that begin with `#`.
///
/// The surviving lines are re-joined with a single `\n`; the result carries no
/// trailing newline.
fn strip_headers(content: &str) -> String {
    let kept: Vec<&str> = content
        .lines()
        .filter(|line| !line.starts_with('#'))
        .collect();
    kept.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 16] {
let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
[
BenchDataContent {
file_name: "TestNames_Latin".to_owned(),
pairs: decompose_data(
&nfc_normalizer
.normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))),
),
},
BenchDataContent {
file_name: "TestNames_Japanese_h".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestNames_Japanese_h.txt"
)))),
},
BenchDataContent {
file_name: "TestNames_Japanese_k".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestNames_Japanese_k.txt"
)))),
},
BenchDataContent {
file_name: "TestNames_Korean".to_owned(),
pairs: decompose_data(
&nfc_normalizer
.normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))),
),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_ar".to_owned(),
#[cfg(debug_assertions)]
pairs: Vec::new(),
#[cfg(not(debug_assertions))]
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_ar.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_de".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_de.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_el".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_el.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_es".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_es.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_fr".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_fr.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_he".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_he.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_pl".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_pl.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_ru".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_ru.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_th".to_owned(),
#[cfg(debug_assertions)]
pairs: Vec::new(),
#[cfg(not(debug_assertions))]
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_th.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_tr".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_tr.txt"
)))),
},
BenchDataContent {
file_name: "udhr_vie".to_owned(),
pairs: decompose_data(
&nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))),
),
},
BenchDataContent {
file_name: "udhr_vie_detone".to_owned(),
pairs: {
let result: Vec<(char, char)> = nfc_normalizer
.normalize(&strip_headers(include_str!("data/wotw.txt")))
.chars()
.filter_map(|c| {
let mut iter = std::iter::once(c).decompose_vietnamese_tones(true);
if let Some(base) = iter.next() {
iter.next().map(|tone| (base, tone))
} else {
None
}
})
.collect();
assert!(!result.is_empty());
result
},
},
]
}
/// Benchmark body: composes every `(first, second)` pair in `composable_points`.
///
/// Each result is passed through `std::hint::black_box` so the optimizer cannot
/// discard the otherwise unused `compose` calls and leave nothing to measure.
fn function_under_bench(
    canonical_composer: &CanonicalCompositionBorrowed,
    composable_points: &[(char, char)],
) {
    for &(first, second) in composable_points {
        std::hint::black_box(canonical_composer.compose(first, second));
    }
}
/// Registers one `canonical_composition` benchmark per data set.
///
/// Each benchmark is identified as `from_nfc_<file_name>` and runs
/// `function_under_bench` over that data set's character pairs.
pub fn criterion_benchmark(criterion: &mut Criterion) {
    let composer = CanonicalCompositionBorrowed::new();
    let mut group = criterion.benchmark_group("canonical_composition");
    for data_set in black_box(normalizer_bench_data()) {
        let id = BenchmarkId::from_parameter(format!("from_nfc_{}", data_set.file_name));
        let pairs = data_set.pairs.as_slice();
        group.bench_function(id, |bencher| {
            bencher.iter(|| function_under_bench(&composer, pairs))
        });
    }
    group.finish();
}
/// Collects the two-character canonical decompositions found in `nfc`.
///
/// Every `char` of the input is decomposed once; only results of the
/// `Decomposed::Expansion` kind contribute a `(first, second)` pair, in input order.
fn decompose_data(nfc: &str) -> Vec<(char, char)> {
    let decomposer = CanonicalDecompositionBorrowed::new();
    let mut pairs = Vec::new();
    for c in nfc.chars() {
        if let Decomposed::Expansion(first, second) = decomposer.decompose(c) {
            pairs.push((first, second));
        }
    }
    pairs
}

View File

@@ -0,0 +1,162 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::properties::CanonicalDecompositionBorrowed;
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
/// One input set for the canonical-decomposition benchmark: the same sample text
/// in each of the four normalization forms.
struct BenchDataContent {
/// Label of the data set; `criterion_benchmark` embeds it in the benchmark IDs.
pub file_name: String,
/// Sample text normalized with the NFC normalizer.
pub nfc: String,
/// Sample text normalized with the NFD normalizer.
pub nfd: String,
/// Sample text normalized with the NFKC normalizer.
pub nfkc: String,
/// Sample text normalized with the NFKD normalizer.
pub nfkd: String,
}
/// Drops every line of `content` that starts with `#` and joins the remaining
/// lines with `\n` (no trailing newline is emitted).
fn strip_headers(content: &str) -> String {
    let mut out = String::with_capacity(content.len());
    // Tracked separately from `out.is_empty()` so that kept empty lines still
    // receive their separator.
    let mut need_separator = false;
    for line in content.lines() {
        if line.starts_with('#') {
            continue;
        }
        if need_separator {
            out.push('\n');
        }
        out.push_str(line);
        need_separator = true;
    }
    out
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc_normalizer.normalize(raw_content).to_string(),
nfd: nfd_normalizer.normalize(raw_content).to_string(),
nfkc: nfkc_normalizer.normalize(raw_content).to_string(),
nfkd: nfkd_normalizer.normalize(raw_content).to_string(),
})
}
/// Stand-in for the benchmark body in builds with `debug_assertions` enabled:
/// it ignores both arguments and does nothing.
#[cfg(debug_assertions)]
fn function_under_bench(
_canonical_decomposer: &CanonicalDecompositionBorrowed,
_decomposable_points: &str,
) {
// Running the real body with debug assertions enabled fails some tests, so it is
// compiled out in that configuration. As a result `cargo test --bench bench`
// passes, while `cargo bench` still does the real work because its profile does
// not include debug assertions.
}
/// Benchmark body: canonically decomposes every `char` of `decomposable_points`.
///
/// Only compiled when `debug_assertions` is disabled. Each result is passed through
/// `std::hint::black_box` so the optimizer cannot discard the otherwise unused
/// `decompose` calls and leave nothing to measure.
#[cfg(not(debug_assertions))]
fn function_under_bench(
    canonical_decomposer: &CanonicalDecompositionBorrowed,
    decomposable_points: &str,
) {
    for point in decomposable_points.chars() {
        std::hint::black_box(canonical_decomposer.decompose(point));
    }
}
/// Registers the `canonical_decomposition` benchmarks.
///
/// For every data set four benchmarks are added, one per pre-normalized input
/// form, in the order NFC, NFD, NFKC, NFKD.
pub fn criterion_benchmark(criterion: &mut Criterion) {
    let decomposer = CanonicalDecompositionBorrowed::new();
    let mut group = criterion.benchmark_group("canonical_decomposition");
    for data in black_box(normalizer_bench_data()) {
        let name = &data.file_name;
        let cases = [
            (format!("from_nfc_{}", name), &data.nfc),
            (format!("from_nfd_{}", name), &data.nfd),
            (format!("from_nfkc_{}", name), &data.nfkc),
            (format!("from_nfkd_{}", name), &data.nfkd),
        ];
        for (id, text) in cases {
            group.bench_function(BenchmarkId::from_parameter(id), |bencher| {
                bencher.iter(|| function_under_bench(&decomposer, text))
            });
        }
    }
    group.finish();
}

View File

@@ -0,0 +1,230 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
/// One input set for the NFC composing-normalizer benchmark: the same sample text
/// in each normalization form, as UTF-8 and as UTF-16.
struct BenchDataContent {
/// Label of the data set; `criterion_benchmark` embeds it in the benchmark IDs.
pub file_name: String,
/// Sample text normalized with the NFC normalizer.
pub nfc: String,
/// Sample text normalized with the NFD normalizer.
pub nfd: String,
/// Sample text normalized with the NFKC normalizer.
pub nfkc: String,
/// Sample text normalized with the NFKD normalizer.
pub nfkd: String,
/// UTF-16 code units of the NFC text.
pub nfc_u16: Vec<u16>,
/// UTF-16 code units of the NFD text.
pub nfd_u16: Vec<u16>,
/// UTF-16 code units of the NFKC text.
pub nfkc_u16: Vec<u16>,
/// UTF-16 code units of the NFKD text.
pub nfkd_u16: Vec<u16>,
}
/// Removes the lines of `content` that start with `#`.
///
/// Remaining lines are concatenated with `\n` between them and nothing after the
/// last one.
fn strip_headers(content: &str) -> String {
    let mut kept = content.lines().filter(|line| !line.starts_with('#'));
    let mut out = String::new();
    if let Some(first) = kept.next() {
        out.push_str(first);
        for line in kept {
            out.push('\n');
            out.push_str(line);
        }
    }
    out
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_string(),
nfd: nfd.to_string(),
nfkc: nfkc.to_string(),
nfkd: nfkd.to_string(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
/// Benchmark body: normalizes the UTF-8 `text` with `normalizer`.
///
/// The result is passed through `std::hint::black_box` so the optimizer cannot
/// treat the otherwise unused normalization as dead code.
fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) {
    std::hint::black_box(normalizer.normalize(text));
}
/// Benchmark body: normalizes the UTF-16 `text` with `normalizer`.
///
/// The result is passed through `std::hint::black_box` so the optimizer cannot
/// treat the otherwise unused normalization as dead code.
fn function_under_bench_utf16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) {
    std::hint::black_box(normalizer.normalize_utf16(text));
}
/// Registers the `composing_normalizer_nfc` benchmarks.
///
/// For every data set eight benchmarks are added: the NFC normalizer applied to
/// the NFC, NFD, NFKC and NFKD inputs as UTF-8, followed by the same four inputs
/// as UTF-16 (IDs suffixed with `_utf_16`).
pub fn criterion_benchmark(criterion: &mut Criterion) {
    let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfc();
    let mut group = criterion.benchmark_group("composing_normalizer_nfc");
    for data in black_box(normalizer_bench_data()) {
        let name = &data.file_name;
        let utf8_cases = [
            (format!("from_nfc_{}", name), &data.nfc),
            (format!("from_nfd_{}", name), &data.nfd),
            (format!("from_nfkc_{}", name), &data.nfkc),
            (format!("from_nfkd_{}", name), &data.nfkd),
        ];
        for (id, text) in utf8_cases {
            group.bench_function(BenchmarkId::from_parameter(id), |bencher| {
                bencher.iter(|| function_under_bench(&normalizer_under_bench, text))
            });
        }
        // UTF-16 inputs
        let utf16_cases = [
            (format!("from_nfc_{}_utf_16", name), &data.nfc_u16),
            (format!("from_nfd_{}_utf_16", name), &data.nfd_u16),
            (format!("from_nfkc_{}_utf_16", name), &data.nfkc_u16),
            (format!("from_nfkd_{}_utf_16", name), &data.nfkd_u16),
        ];
        for (id, units) in utf16_cases {
            group.bench_function(BenchmarkId::from_parameter(id), |bencher| {
                bencher.iter(|| function_under_bench_utf16(&normalizer_under_bench, units))
            });
        }
    }
    group.finish();
}

View File

@@ -0,0 +1,211 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
/// One input set for the NFKC composing-normalizer benchmark: the same sample text
/// in each normalization form, as UTF-8 and as UTF-16.
struct BenchDataContent {
/// Label of the data set; `criterion_benchmark` embeds it in the benchmark IDs.
pub file_name: String,
/// Sample text normalized with the NFC normalizer.
pub nfc: String,
/// Sample text normalized with the NFD normalizer.
pub nfd: String,
/// Sample text normalized with the NFKC normalizer.
pub nfkc: String,
/// Sample text normalized with the NFKD normalizer.
pub nfkd: String,
/// UTF-16 code units of the NFC text.
pub nfc_u16: Vec<u16>,
/// UTF-16 code units of the NFD text.
pub nfd_u16: Vec<u16>,
/// UTF-16 code units of the NFKC text.
pub nfkc_u16: Vec<u16>,
/// UTF-16 code units of the NFKD text.
pub nfkd_u16: Vec<u16>,
}
/// Filters out the `#`-prefixed lines of `content`.
///
/// Lines that remain are joined with `\n`; the output never ends with a newline
/// added by this function.
fn strip_headers(content: &str) -> String {
    let is_payload = |line: &&str| !line.starts_with('#');
    let payload_lines: Vec<&str> = content.lines().filter(is_payload).collect();
    payload_lines.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_string(),
nfd: nfd.to_string(),
nfkc: nfkc.to_string(),
nfkd: nfkd.to_string(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
/// Benchmark body: normalizes the UTF-8 `text` with `normalizer`.
///
/// The result is passed through `std::hint::black_box` so the optimizer cannot
/// treat the otherwise unused normalization as dead code.
fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) {
    std::hint::black_box(normalizer.normalize(text));
}
/// Benchmark body: normalizes the UTF-16 `text` with `normalizer`.
///
/// The result is passed through `std::hint::black_box` so the optimizer cannot
/// treat the otherwise unused normalization as dead code.
fn function_under_bench_u16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) {
    std::hint::black_box(normalizer.normalize_utf16(text));
}
/// Registers the `composing_normalizer_nfkc` benchmarks.
///
/// For every data set eight benchmarks are added: the NFKC normalizer applied to
/// the NFC, NFD, NFKC and NFKD inputs as UTF-8, followed by the same four inputs
/// as UTF-16 (IDs suffixed with `_u16`).
pub fn criterion_benchmark(criterion: &mut Criterion) {
    let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfkc();
    let mut group = criterion.benchmark_group("composing_normalizer_nfkc");
    for data in black_box(normalizer_bench_data()) {
        let name = &data.file_name;
        let utf8_cases = [
            (format!("from_nfc_{}", name), &data.nfc),
            (format!("from_nfd_{}", name), &data.nfd),
            (format!("from_nfkc_{}", name), &data.nfkc),
            (format!("from_nfkd_{}", name), &data.nfkd),
        ];
        for (id, text) in utf8_cases {
            group.bench_function(BenchmarkId::from_parameter(id), |bencher| {
                bencher.iter(|| function_under_bench(&normalizer_under_bench, text))
            });
        }
        // UTF-16 inputs
        let utf16_cases = [
            (format!("from_nfc_{}_u16", name), &data.nfc_u16),
            (format!("from_nfd_{}_u16", name), &data.nfd_u16),
            (format!("from_nfkc_{}_u16", name), &data.nfkc_u16),
            (format!("from_nfkd_{}_u16", name), &data.nfkd_u16),
        ];
        for (id, units) in utf16_cases {
            group.bench_function(BenchmarkId::from_parameter(id), |bencher| {
                bencher.iter(|| function_under_bench_u16(&normalizer_under_bench, units))
            });
        }
    }
    group.finish();
}

View File

@@ -0,0 +1,25 @@
# Generating microbench data
The full versions of these files are located
[in the ICU repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data).
## Sanitizing the file
```shell
sed -i '/^#/d' ${filename}
sed -i '/^$/d' ${filename}
```
## Shuffling the file
```shell
shuf -n 20 ${filename} -o ${filename}
```
## Add back the header (if you plan on submitting the files)
```
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
```

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
かげやま,みのる
むらかみ,とおる
つじさわ,けい
やすい,たかゆき
むらさき,としお
はせがわ,ひであき
うるしばら,よしひこ
ままだ,ひろし
おおぼら,えいじろう
おおば,まさひで
きたばたけ,たかひこ
はまさき,あつし
ほりい,つねお
もり,だいいち
いとう,しんいち
くにもと,じゅんじ
おか,のりひと
たに,よしあき
しらがき,ひろあき
しらはま,たけひろ
むらかみ,やすひろ
うめはら,たかし
いわた,ひろし
すぎえ,かつとし
てらにし,ひろみつ
まつおか,だいすけ
もろほし,すすむ
いしはら,たかし
おしま,ひろお
なかお,ゆうじ
いかり,はるお
きまち,まさき
ふるかわ,みちお
かねこ,しゅうへい
なかがわ,ともみ
ささき,しんご
うちだ,たくじ
うめだ,さかえ
しばた,いくこ
まきした,けいこ
まつもと,しんいちろう
たかの,かずよし
いしわた,なおひさ
いうち,まこと
いまい,りほ
みずた,のりあき
かくたに,まなぶ
わだ,ほまれ
わかまつ,かずき
かわぐち,ひろき

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
ホリモト,ユウジ
ハナミ,ヤスヒデ
イシザカ,タカユキ
ゼンケ,トシオ
ハトリ,ユウコ
ナガオカ,トモユキ
コウダ,ケンイチ
イシダ,ヒロシ
ミワ,シゲユキ
イシカワ,ヒロシ
スズキ,ユウスケ
オクダ,ヨシノリ
シムラ,サカエ
エビシマ,ヤスユキ
イブカ,ヨシテル
タノ,マコト
ドウゾノ,セイヤ
ヤマナカ,サツミ
トミイエ,ハヤト
アザミ,ツトム
タナカ,キョウコ
コジマ,アツシ
フミハラ,カオリ
スズキ,マサユキ
ナトリ,ケンヤ
スズキ,ユウコ
スズキ,ヒサエ
ナカガワ,カツヨシ
スズキ,マサフミ
マツヤマ,トシオ
ヨシナガ,チカエ
キタムラ,リカコ
アオキ,タクオ
ヤマグチ,ヤスヒロ
スギムラ,シゲオ
ウエスギ,マサミ
マツムラ,シンイチ
クバ,タカシ
スドウ,タカトシ
フジモト,ヒロシ
イトウ,シュウイチ
コバヤシ,カズミ
タナカ,ヒロカツ
イシダ,ツカサ
ヤマダ,マサコ
カミヤ,トミエ
タケモト,ユウジ
スミノ,コウジ
ヒロハタ,タクヤ
ミヒラ,リョウヘイ

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
김명희
홍차수
허순재
강영휘
김운주
이종환
이은국
강태호
강일래
김동현
곽기자
차재수
표봉기
문대원
이형기
최교표
박식현
홍종립
서창수
김쌍건
서말도
이병훈
김희수
박학태
강태종
조문란
신범균
백두진
이철정
김태중
이성현
김주조
김강행
이정길
김완일
권수자
이춘철
김판근
김곡리
이경형
이운만
손상철
유기숙
박정한
조윤래
유신호
이두수
김재률
김성홍
김혜경

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
González, Joan
Reinders, Jim
Applebroog, Ida
Kidd, Joseph Bartholomew
Gulácsy, Lajos
Letendre, Rita
Zuccaro, Federico
Apt the Elder, Ulrich
Drummond, Arthur
Manley, Thomas
Broc, Jean
Ramunno, Tony
Simone dei Crocifissi
Lane, Theodore
Symonds, William Robert
Johnson, Frank Tenney
Cox, Gardner
Bunbury, Charles
Pedro de la Cuadra
Payne, William
Lucas, John Seymour
Holsman, Elizabeth T.
de Vries, Auke
Laszlo, Philip Alexius de
Shigemasa
Wolfe, Ruth Mitchell
Buck, John
Baselitz, Georg
Hook, Walter
Segall, Lasar
Brush, George deForest
Master of Jánosrét
Sutherland, Elizabeth Leveson-Gower, Countess of
Tuckerman, Jane
Varley, F.H.
Fosso, Samuel
Gardner, Daniel
Sadler, Walter Dendy
Clausen, Franciska
Coman, Charlotte Buell
Wakelin, Roland
Payne, Jon, CML
Campagna, Girolamo
Wiener, Phyllis
Sallee, Charles
Fitzgerald, John Anster
Gribbroek, Robert
Laporte, John
Lévy-Dhurmer, Lucien
Young, Stephen Scott

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
ณรงค์ โต๊ะเงิน
กิตติ บุญวันต์
สมหมาย ดาบทองดี
ธวัชชัย อิสระนิมิตร
วรรณา โสภณนรินทร์
วินัย หมู่มิ่ง
พัชรี ชูจิรวงศ์
สมปอง จิวไพโรจน์กิจ
บุญส่ง กวยรักษา
นิพนธ์ นิ่มใหม่
พัชรี สุวพรศิลป์
เจริญ นววัฒนทรัพย์
อรพินท์ แซ่เจี่ย
ชัยพร สมใจนึก
ประนอม โคศิลา
ฉวีวรรณ ศรสังข์ทอง
วัชรา เจริญรัตนพร
สุภัท นกศิริ
อู๋ มาลาเล็ก
ประยูร ไชโย
ละออ อยู่ยืนยง
สมใจ วิวัฒน์วานิช
จุมพล จันทรศรีเกษร
พุฒ ดอกไม้จีน
บุญชัย วรกิจพรสิน
สมาน ธูปเทียน
พงศ์ศักดิ์ แซ่แต้
อำนาจ ไวจงเจริญ
พรทิพย์ แซ่ลี้
อุไรวรรณ สาครสินธุ์
อำพล วีระตะนนท์
สมจิตร ใจวังโลก
สุเทพ ตันวินิจ
สวาท ทรัพย์มาก
สมศักดิ์ เจือจันทร์
ดัสซันซิงห์ กุลาตี
ธีร ศรแก้ว
พรรณยุพา ฮ่อสกุล
สำราญ จันทร์เอี่ยม
พจน์ มั่นกันนาน
สุธี บุณยเกียรติ
บุญโชติ ทิพย์ประเสริฐสิน
ประดิษฐ์ ทองพสิฐสมบัติ
จำเนียร เพ็งเจริญ
สมศักดิ์ อรุณรัตน์
อนุชา จารุหิรัญสกุล
พิกุล มโนภิญโญภิญญะ
ผ่องศรี นกแก้ว
อารี วิไลวรรณ
ณรงค์วิทย์ วิทสัทธาวรกุล

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
ممارسة مراعاة
العنصرية
حدود والشيخوخة
بالحكم كهذا ينتفع
البلاد
تربية
الغير التقدم والعدل
نحو بالتعليم والحرية
تأمين متساو
للتعليم فيها
آذت اعتداء للتعليم
ليس المتأصلة
والمساهمة الضروري تتناقض
وتأسيس
رضى
شرعي الطبية
لكيلا الجمعية والحرية
للرجال التزوج
بالكرامة
حرية بين
هذه العيش تنظر
قيد
يقررها والصداقة
اعتُمد وينبغي اجتماعي
حرمان
للإدراك بأجر إنتاجه
التربية القانون
لإنصافه وتأسيس وسمعته
أساسه للرجال
كافة
المجهود دولي أينما
وإلى
بنشاط تجري
والأمم مثل لحقوق
الإنسان بشروط بحماية
شرفه
كما الوظائف
حياته ديسمبر
ولما
هذه
غاية جديد إنسان
حرية
متهم الوطنية قدمًا
التملك وضع
شرعية ويعبر تأدية
بنظام عمل والأخلاق
التملك لشخصيته يلجأ
بحال يضطر ولا
الانضمام بالكرامة
عضوا

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
Herrschaft Freiheiten Not
Gewalt
stets anderer begründet
erhobenen innerstaatliche
Heiratsfähige freie
offenstehen Begrenzung grausamer
Maßnahmen höchste
unentbehrlich privat
erniedrigender
Verachtung freie
innezuhaben innerstaatlichen
kommen
werden gleichgültig
Würde überall höchste
Schutzmaßnahmen den Pflichten
Wille Bestimmung
Leibeigenschaft einschließlich für
gleiche bekräftigt Gewissens
Wohles
Generalversammlung
Volkes
Völkern gegenwärtig Zusammenarbeit
Heiratsfähige sowie Jeder
Stellung
Lebensstandard
seinem
Rede strafbaren Sicherheit
mit
Kulthandlungen Grund
ärztlicher
Auflösung Anforderungen anzugehören
Furcht
keine Geburt
Wohles Furcht genügen
befriedigende Medien
anzugehören Urlaub Vereinigungen
hinzuwirken verboten Resolution
kommen
sozialer vor irgendein
Bestimmung Bestimmung
Fall natürliche kein
Geschlecht Aufhetzung eigenen
seinen
über
Unterlassung Berücksichtigung
war
Rufes stets
Volkes anderer Beschränkungen
Handlungen dessen
Die

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
προάγει αλληλογραφία
λογική έχει
ιδρύει ζωή τεχνική
δυνατότητες
περιορισμό συνόλου
ασκεί παραγνώριση συναφθεί
αναγνωρίζουν ποινικής εκδηλώνει
κοινότητας διακυβέρνηση στα
απέναντι υψηλή
περιστάσεων αξιόποινη
σεβασμό
συντήρησής κατά εξασφαλίσουν
παραβιάζουν συμπληρώνεται νόμο
άμεσα
σημαίνει καθεστώς
ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων
ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση
μέσο
ίση Εχει
ειρήνης Κάθε
μέλη μορφή
όσο
κρατείται Στο Διακηρύσσει
οικονομικών έκφρασης εξασφαλίζεται
κάθε
περίπτωση απολαμβάνουν
ποινικό γεροντική
είναι μαζί δικαστήρια
μαζί προοπτική
δική
βαρβαρότητας
οικονομικών εξασφαλίσει
υποχρεώσεις οδήγησαν
Οικουμενική Διακήρυξης γονείς
στις μυστική αντιπροσώπους
Διακήρυξης άδειες βιοτικό
αναπηρία ομάδα
πραγματικό
καλύτερες
ανάπαυση
δίκαιες ένα δικαίου
μετέχει στους
θρησκευτικών ποινικής
Κανείς ίσα
πεποιθήσεις
πολιτικές ανάλογα δουλεία
πολιτικές ιατρική ωσότου
ηθικής χωρίς
ανδρών ικανό
καθώς

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
duración común
delito reconocimiento alimentación
inalienables
entre seguridad escogidos
comportarse dignidad
autónomo gobierno tiempo
omisiones
comisión
Derechos territorios
debe
han
regresar inalienables
regresar
desempleo científico
arbitrariamente proclamada
están contraerse esposos
cualesquiera
salir carácter desarrollo
solamente justas
personalidad una
cuanto
garantice resolución
concepción
tomar impondrá
cualquier reconocimiento
obligatoria obligatoria satisfactoria
acusación sin
artísticas penal culturales
pagadas examen
Además Organización dignidad
opresión esposos ejercidos
barbarie están mientras
por
idioma
recursos pagadas
materia Nada ella
con injerencias
inspirándose
organización
gozar jurisdicción
que
asegurar
humana libertad
nadie equivalente
escoger remuneración
torturas
individuos poder
disfruten seres Preámbulo
desempleo
liberados

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
conforme êtres fonctions
non tout généralisé
premier lui
faire hommes dégalité
peuple volonté bénéficier
générale nationales
cruels plus
dencourager opinions
genre lesprit
dorigine effectif
exigences auront
résultent situation recevoir
peuples Chacun
sont dégalité
jouissent
auront lesprit
pays telle
publiquement
mariage foi
travail démocratique religieux
rémunération
omissions telles
Léducation
raison complétée donner
invoqué auront arbitraires
lamitié suffisant affaires
travaille laccomplissement lintermédiaire
race
opinions celles
assurer par privée
valeur
violant traite premier
inhérente
bienfaits lavènement
Unies sil actions
inquiété lesclavage
inquiété
esclaves lieu
salaire
par
toute
innocente procédure membres
arts lidéal envers
suffrage territoires inhumains
dimmixtions lorganisation progrès
comme égalité Unies
maternité
violerait suprême sécurité
impliquant eux loisirs
nationalité

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
זקנה משפילים
ינתן חברתי עניניו
הפוב
ולהיות זכויות הישגים
יאסרו מטעמי וללא
ספרותית השלם
למנוחה חינם
וההתאגדות
לטפח
באלה במלואן
יהנו
ולרווחתם לגבר האדם
בכבודו שבארצות כבוד
ובינלאומיים
בכך לתנאי אישי
שאינן
שרירותי
במשפט
ולעקרונותיהן מטעם
שרירותית האשמה יהיה
החינוך ולבטחון
סובלנות אשמתו במגילה
המאוחדות חיוני
חשוב במקרה
כלתי העולם
שמקורה כציבור
לשויון
לתקנה
תלוי ההתאספות
הדיבור שהוא
והבלתי והבסיסית
ולעקרונותיהן יהא וישאף
ביתנ הבינלאומי
והזלזול להקנות
בגלל כולם שיושלם
לחיים
בדבר
לשירות
זכויות
לפני
אדם ולא מזזמנות
קנינו שהיה ההתאספות
בינלאומי חיוניות לבקש
תהיינה
ובזכות בכורה מהגנה
מתוך
ובמצפון מזומנות לאגד
והחמריים סוציאלי
אנושיים ובהצבעה
פראיים

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
uciskowi posiadania prawo
społecznego największych skazany
czy
potrzeby samodzielnie przystępowania
Krzewi też dokonania
pełną prawo
buntu
moralności
zapewnienia znaczenie
nieludzki wypadek Nikt
zasadności jakikolwiek Każdy
samowolnie krajem
międzynarodowego
członek wielu
rozwój wynikających obalenia
rasy
grudnia która
jedynie urlopu ani
małżeńskie stanowi ustaniu
człowieka postępowych
prześladowania
politycznej które zawarcia
Deklaracja
ingerować wyłącznie
studia Nikt
innego uprawianie zrozumienie
wybranych swobodę wyznania
wolni osobowości
ograniczenie Nie
równej społecznego uciekać
będącą POWSZECHNA
niezdolności poszukiwania międzynarodowej
konieczne potrzeby posiada
opinii wychowywania 1948
międzynarodowej zatrzymać
przedstawicieli
przeciw
wynikających organy pracę
człowiek grupami
niezbędnych
wolności podstawowym
opinii małżonków wolność
postępować zdecydowanie komórką
odniesieniu
pokoju azyl
zawodowych powrócić człowiek
konstytucję
takiej postaciach powszechnego
wygnać wygnać
wspólny poszanowania

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
областях
будут должен
обеспечиваются нежели
котором Уставе
социального моральных
совершеннолетия предоставление
том независимо
существование
вмешательства какому ограниченной
распространять
находить помощь
искусством
унижающим положения искать
изгнанию член совершеннолетия
обществом имуществом государственной
идеи братства
наслаждаться значение социальной
осуществления юрисдикцией наказанию
достойное свою III
жизнь расторжения инвалидности
терпимости этого
целях равны
обеспечиваются законным
принуждаем правосубъектности
пыткам доступа неприкосновенность
Брак против
прибегать независимой
человека человеческой
быть независимо религии
публичным
членам против
разумом результатом семью
Принята участие
беспристрастным тем
частным основной
правового
страной обслуживание
было свободу полное
рабочего свободны
состоянии помощь религиозными
полное
владеть власти морали
меньшей
братства социальному убежища
государств
равны который дети
терпимости
получать бесплатным полного
богослужении
отдельным

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
คิด ใตัอำ เคลื่อนไหว
บังคับ บาก
สิ่ง สิ้น
วัตถุ
ชาย อาศัย เท่านั้น
สิน
เกา
ดูแล พิธีกรรม
ภายใน
เพศ
หนัก ประสงค์
เหตุ
งาน รักษา
เพศ ภาษา
นี้
คู่ สัญชาติ ต้องการ
วิธี ระหว่าง ตกลง
ทำนอง
สืบ กับ ศิลปกรรม
เหนือ วรรณกรรม
คิด การก หน้าที่
ชาติ ศิลปกรรม แต่
สามัญ สอด
เหยียด วิธี จุด
หน้า ถ้า เบื้อง
ประชุม
ศิลปกรรม
เสรีภาพ โหด ก่อ
เกียรติศักดิ์ ป่วย เอกราช
ประหัต มโนธรรม การ
แทน
ขัดขืน เวลา เสียง
กฎบัตร พยายาม
สิน หน้า
จำเป็น
ประชาธิปไตย หน่วย
กรณี จริงจัง
ทำนอง
ทาษ
เพิ่ม
บรรดา ขวาง
กักขัง
มนุษย์
ชาย ประกัน มนุษยธรรม
จะบัน มูลฐาน เถื่อน
พฤติ
มิได้
หญิง คู่
สมา ปฏิบัติ อนึ่ง
สิ่ง ทาษ

View File

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mecburidir ilim
isnadın sınırları suç
tutuklanamaz diğer
memleket korunmasi kullanılamaz
İnsanlık ilerlemeyi
bir mülk menfaatlerinin
usul zümreye herhangi
mahkeme vicdana ilerleyişe
zulüm zalimane
ilim öncelikle çocuk
mevzubahis ancak
muamelesi dinlenmeye
eşitlikle prensiplerine ülkenin
öğretim bulunmalarına yardım
memleketler amacıyla
birbirlerine
olmalıdır
bırakılamaz serbestisine
hürriyetin iyi
hükmü işbu zalimane
evlenme memleketi tedbirlerle
evlenmek ahalisi işini
hürriyetler
belirlenmiş kere
elde cürüme
tanınan dünyaca yüksek
müddetinin ailesine
vicdan kırıcı itibariyle
geniş inanma
kendi görevleri Teşkilatı
yaymak
öğretim vesayet
renk kişiliğinin
tamamlanan
haklara bulunma
hükmü uygulanabilecek
etmiş geliştirilmesini hoşgörü
sahiptir temel
giyim
Bundan temeli
icaplarını
mülk karışma tekmil
vicdana hürriyetine işini
Herkesin vahşiliklere
dolaşma dünyanın
davasının Uluslararasında idamesi
eşittir
haklardan hakkı
kovuşturmalar hürriyetlerden gözönünde
Evrensel fiilli beyannamesi

View File

@@ -0,0 +1,58 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
# The contents of this file have been translated by "Google Translate".
Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này
đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh
lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con
người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và
nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể
xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong
một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới
này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc
chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển
vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là
nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý
tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra.
Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã
qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có
những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào
đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian,
những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối
với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm,
nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra
những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng
lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt
trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà
nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được.
Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu
đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng
nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó.
Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng
tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có
không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại
sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù
phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào
bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa,
hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta
cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng
ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất
yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết
thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng
ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn
vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó,
nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta.
Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã
thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp
của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó.
Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành
một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần
thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái
tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng
ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm
về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng
ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều
mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây
trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân.

View File

@@ -0,0 +1,213 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
/// One benchmark corpus, pre-normalized into each of the four normalization forms.
///
/// Every form is kept twice: as UTF-8 text (`String`) and as UTF-16 code units
/// (`Vec<u16>`), one per input encoding exercised by the benchmarks in this file.
struct BenchDataContent {
/// Label of the source data file; it is embedded in the benchmark IDs.
pub file_name: String,
/// Corpus text normalized to NFC.
pub nfc: String,
/// Corpus text normalized to NFD.
pub nfd: String,
/// Corpus text normalized to NFKC.
pub nfkc: String,
/// Corpus text normalized to NFKD.
pub nfkd: String,
/// UTF-16 encoding of the NFC text.
pub nfc_u16: Vec<u16>,
/// UTF-16 encoding of the NFD text.
pub nfd_u16: Vec<u16>,
/// UTF-16 encoding of the NFKC text.
pub nfkc_u16: Vec<u16>,
/// UTF-16 encoding of the NFKD text.
pub nfkd_u16: Vec<u16>,
}
/// Removes the header lines from the text of a benchmark data file.
///
/// Every line that starts with `#` is dropped and the remaining lines are
/// joined with `\n`. Line terminators are consumed by `str::lines`, so the
/// result has no trailing newline and `\r\n` endings come out as `\n`.
fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Borrowed lines are all `join` needs; no per-line `String` is allocated.
        .collect::<Vec<&str>>()
        .join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_string(),
nfd: nfd.to_string(),
nfkc: nfkc.to_string(),
nfkd: nfkd.to_string(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
/// Runs one normalization pass of `normalizer` over the UTF-8 `text`.
///
/// This function returns `()`, so the timed closure has no value of its own to
/// hand back to the harness. The result is therefore passed through
/// `std::hint::black_box` before it is dropped, so the optimizer has to treat
/// the normalized output as used.
fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) {
    std::hint::black_box(normalizer.normalize(text));
}
/// Runs one normalization pass of `normalizer` over the UTF-16 `text`.
///
/// This function returns `()`, so the timed closure has no value of its own to
/// hand back to the harness. The result is therefore passed through
/// `std::hint::black_box` before it is dropped, so the optimizer has to treat
/// the normalized output as used.
fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) {
    std::hint::black_box(normalizer.normalize_utf16(text));
}
/// Registers the `decomposing_normalizer_nfd` benchmark group.
///
/// For every corpus from `normalizer_bench_data`, the NFD normalizer is timed
/// on the NFC, NFD, NFKC and NFKD renditions of the text: first the four
/// UTF-8 inputs, then the four UTF-16 inputs (IDs suffixed with `_u16`).
pub fn criterion_benchmark(criterion: &mut Criterion) {
    let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfd();
    let mut group = criterion.benchmark_group("decomposing_normalizer_nfd");

    for data in black_box(normalizer_bench_data()) {
        // UTF 8
        let utf8_inputs = [
            ("nfc", data.nfc.as_str()),
            ("nfd", data.nfd.as_str()),
            ("nfkc", data.nfkc.as_str()),
            ("nfkd", data.nfkd.as_str()),
        ];
        for (form, text) in utf8_inputs {
            let id = BenchmarkId::from_parameter(format!("from_{}_{}", form, data.file_name));
            group.bench_function(id, |bencher| {
                bencher.iter(|| function_under_bench(&normalizer_under_bench, text))
            });
        }

        // UTF 16
        let utf16_inputs = [
            ("nfc", data.nfc_u16.as_slice()),
            ("nfd", data.nfd_u16.as_slice()),
            ("nfkc", data.nfkc_u16.as_slice()),
            ("nfkd", data.nfkd_u16.as_slice()),
        ];
        for (form, units) in utf16_inputs {
            let id = BenchmarkId::from_parameter(format!("from_{}_{}_u16", form, data.file_name));
            group.bench_function(id, |bencher| {
                bencher.iter(|| function_under_bench_u16(&normalizer_under_bench, units))
            });
        }
    }

    group.finish();
}

View File

@@ -0,0 +1,211 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
/// One benchmark corpus, pre-normalized into each of the four normalization forms.
///
/// Every form is kept twice: as UTF-8 text (`String`) and as UTF-16 code units
/// (`Vec<u16>`), one per input encoding exercised by the benchmarks in this file.
struct BenchDataContent {
/// Label of the source data file; it is embedded in the benchmark IDs.
pub file_name: String,
/// Corpus text normalized to NFC.
pub nfc: String,
/// Corpus text normalized to NFD.
pub nfd: String,
/// Corpus text normalized to NFKC.
pub nfkc: String,
/// Corpus text normalized to NFKD.
pub nfkd: String,
/// UTF-16 encoding of the NFC text.
pub nfc_u16: Vec<u16>,
/// UTF-16 encoding of the NFD text.
pub nfd_u16: Vec<u16>,
/// UTF-16 encoding of the NFKC text.
pub nfkc_u16: Vec<u16>,
/// UTF-16 encoding of the NFKD text.
pub nfkd_u16: Vec<u16>,
}
/// Removes the header lines from the text of a benchmark data file.
///
/// Every line that starts with `#` is dropped and the remaining lines are
/// joined with `\n`. Line terminators are consumed by `str::lines`, so the
/// result has no trailing newline and `\r\n` endings come out as `\n`.
fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Borrowed lines are all `join` needs; no per-line `String` is allocated.
        .collect::<Vec<&str>>()
        .join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_string(),
nfd: nfd.to_string(),
nfkc: nfkc.to_string(),
nfkd: nfkd.to_string(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
/// Runs one normalization pass of `normalizer` over the UTF-8 `text`.
///
/// This function returns `()`, so the timed closure has no value of its own to
/// hand back to the harness. The result is therefore passed through
/// `std::hint::black_box` before it is dropped, so the optimizer has to treat
/// the normalized output as used.
fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) {
    std::hint::black_box(normalizer.normalize(text));
}
/// Runs one normalization pass of `normalizer` over the UTF-16 `text`.
///
/// This function returns `()`, so the timed closure has no value of its own to
/// hand back to the harness. The result is therefore passed through
/// `std::hint::black_box` before it is dropped, so the optimizer has to treat
/// the normalized output as used.
fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) {
    std::hint::black_box(normalizer.normalize_utf16(text));
}
/// Registers the `decomposing_normalizer_nfkd` benchmark group.
///
/// For every corpus from `normalizer_bench_data`, the NFKD normalizer is timed
/// on the NFC, NFD, NFKC and NFKD renditions of the text: first the four
/// UTF-8 inputs, then the four UTF-16 inputs (IDs suffixed with `_u16`).
pub fn criterion_benchmark(criterion: &mut Criterion) {
    let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfkd();
    let mut group = criterion.benchmark_group("decomposing_normalizer_nfkd");

    for data in black_box(normalizer_bench_data()) {
        // UTF 8
        let utf8_inputs = [
            ("nfc", data.nfc.as_str()),
            ("nfd", data.nfd.as_str()),
            ("nfkc", data.nfkc.as_str()),
            ("nfkd", data.nfkd.as_str()),
        ];
        for (form, text) in utf8_inputs {
            let id = BenchmarkId::from_parameter(format!("from_{}_{}", form, data.file_name));
            group.bench_function(id, |bencher| {
                bencher.iter(|| function_under_bench(&normalizer_under_bench, text))
            });
        }

        // UTF 16
        let utf16_inputs = [
            ("nfc", data.nfc_u16.as_slice()),
            ("nfd", data.nfd_u16.as_slice()),
            ("nfkc", data.nfkc_u16.as_slice()),
            ("nfkd", data.nfkd_u16.as_slice()),
        ];
        for (form, units) in utf16_inputs {
            let id = BenchmarkId::from_parameter(format!("from_{}_{}_u16", form, data.file_name));
            group.bench_function(id, |bencher| {
                bencher.iter(|| function_under_bench_u16(&normalizer_under_bench, units))
            });
        }
    }

    group.finish();
}

View File

@@ -0,0 +1,170 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
use criterion::{black_box, Criterion, Throughput};
use smallvec::SmallVec;
//use detone::IterDecomposeVietnamese;
// Number of normalized characters taken for each generated benchmark input;
// each one is stored as a single `u16`.
// 2048 code units * 2 bytes per `u16` = 4096 bytes, which fits on one 4KB
// memory page. That maximizes the run to take the average over without
// introducing cross-page effects.
const INPUT_SIZE: usize = 2048;
/// Generates NFC benchmark input as UTF-16 code units.
///
/// The characters of `s` are repeated endlessly, normalized to NFC, and the
/// first `INPUT_SIZE` normalized characters are kept, one `u16` each.
///
/// # Panics
///
/// Panics if a normalized character lies outside the Basic Multilingual
/// Plane, since such a character does not fit in a single `u16`.
fn generate_bmp_input_nfc(s: &str) -> Vec<u16> {
    let normalizer = ComposingNormalizerBorrowed::new_nfc();
    let mut units = Vec::new();
    for ch in normalizer.normalize_iter(s.chars().cycle()).take(INPUT_SIZE) {
        match ch {
            '\0'..='\u{FFFF}' => units.push(ch as u16),
            _ => unreachable!("Data should stay on the BMP!"),
        }
    }
    units
}
/// Generates NFD benchmark input as UTF-16 code units.
///
/// The characters of `s` are repeated endlessly, normalized to NFD, and the
/// first `INPUT_SIZE` normalized characters are kept, one `u16` each.
///
/// # Panics
///
/// Panics if a normalized character lies outside the Basic Multilingual
/// Plane, since such a character does not fit in a single `u16`.
fn generate_bmp_input_nfd(s: &str) -> Vec<u16> {
    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
    let mut units = Vec::new();
    for ch in normalizer.normalize_iter(s.chars().cycle()).take(INPUT_SIZE) {
        match ch {
            '\0'..='\u{FFFF}' => units.push(ch as u16),
            _ => unreachable!("Data should stay on the BMP!"),
        }
    }
    units
}
/// Removes header lines and replaces line feeds with spaces.
///
/// Every line that starts with `#` is dropped and the remaining lines are
/// joined with a single space, so the result is one long line.
/// Do not use for languages that don't use spaces!
fn prepare_file_contents(content: &str) -> String {
    content
        .lines()
        .filter(|line| !line.starts_with('#'))
        // Borrowed lines are all `join` needs; no per-line `String` is allocated.
        .collect::<Vec<&str>>()
        .join(" ")
}
/// Passes a UTF-16 slice through `black_box` and returns the result.
///
/// Callers in this file pass `&Vec<u16>`; the `&[u16]` parameter type makes
/// the conversion to a slice happen at the call, before `black_box` sees the
/// value. Presumably this keeps the optimizer from specializing the benchmark
/// on the concrete input.
fn slice_as_slice(s: &[u16]) -> &[u16] {
black_box(s)
}
/// Registers the UTF-16 throughput benchmarks for one language sample.
///
/// Two groups are created, `utf16_throughput_nfc_<name>` and
/// `utf16_throughput_nfd_<name>`, fed with NFC and NFD input respectively.
/// Each group measures `read` (splitting off the already-normalized prefix
/// with the normalizer matching the input form), then writing the input out
/// in its own form, then writing it out in the other form.
fn bench_lang(name: &str, data: &str, c: &mut Criterion) {
    let nfc_units = generate_bmp_input_nfc(data);
    let nfd_units = generate_bmp_input_nfd(data);
    let composer = ComposingNormalizerBorrowed::new_nfc();
    let decomposer = DecomposingNormalizerBorrowed::new_nfd();

    // Appending to this sink is infallible (it does not return `Err`), and its
    // inline capacity is sized to be large enough not to actually take the
    // heap allocation path.
    let mut sink: SmallVec<[u16; INPUT_SIZE * 2]> = SmallVec::new();

    // Group fed with NFC input.
    let mut nfc_group = c.benchmark_group(format!("utf16_throughput_nfc_{}", name));
    nfc_group.throughput(Throughput::Elements(nfc_units.len() as u64));
    nfc_group.bench_function("read", |b| {
        b.iter(|| {
            let prefix_len = composer
                .split_normalized_utf16(slice_as_slice(&nfc_units))
                .0
                .len();
            let _ = black_box(prefix_len);
        })
    });
    nfc_group.bench_function("writing_to_nfc", |b| {
        b.iter(|| {
            // Clearing is expected to be trivial, so it is fine inside the timed closure.
            sink.clear();
            let written =
                composer.normalize_utf16_to(slice_as_slice(&nfc_units), black_box(&mut sink));
            let _ = black_box(written);
        })
    });
    nfc_group.bench_function("writing_to_nfd", |b| {
        b.iter(|| {
            // Clearing is expected to be trivial, so it is fine inside the timed closure.
            sink.clear();
            let written =
                decomposer.normalize_utf16_to(slice_as_slice(&nfc_units), black_box(&mut sink));
            let _ = black_box(written);
        })
    });
    nfc_group.finish();

    // Group fed with NFD input.
    let mut nfd_group = c.benchmark_group(format!("utf16_throughput_nfd_{}", name));
    nfd_group.throughput(Throughput::Elements(nfd_units.len() as u64));
    nfd_group.bench_function("read", |b| {
        b.iter(|| {
            let prefix_len = decomposer
                .split_normalized_utf16(slice_as_slice(&nfd_units))
                .0
                .len();
            let _ = black_box(prefix_len);
        })
    });
    nfd_group.bench_function("writing_to_nfd", |b| {
        b.iter(|| {
            // Clearing is expected to be trivial, so it is fine inside the timed closure.
            sink.clear();
            let written =
                decomposer.normalize_utf16_to(slice_as_slice(&nfd_units), black_box(&mut sink));
            let _ = black_box(written);
        })
    });
    nfd_group.bench_function("writing_to_nfc", |b| {
        b.iter(|| {
            // Clearing is expected to be trivial, so it is fine inside the timed closure.
            sink.clear();
            let written =
                composer.normalize_utf16_to(slice_as_slice(&nfd_units), black_box(&mut sink));
            let _ = black_box(written);
        })
    });
    nfd_group.finish();
}
// Sample texts for the benchmarks, one per language tag that
// `criterion_benchmark` passes to `bench_lang`. EL, FR and VI are embedded
// from files under `./data/` at compile time; EN and ZH are inline literals.
static EL: &str = include_str!("./data/TestRandomWordsUDHR_el.txt");
static EN: &str = "The ICU4X normalizer is an implementation of Unicode Normalization Forms. ";
static FR: &str = include_str!("./data/TestRandomWordsUDHR_fr.txt");
static VI: &str = include_str!("./data/wotw.txt");
static ZH: &str = "單父人呂公善沛令,辟仇,從之客,因家焉。沛中豪傑吏聞令有重客,皆往賀。";
// zh text from https://www.gutenberg.org/cache/epub/23841/pg23841.txt
// metadata at https://www.gutenberg.org/ebooks/23841
// If you replace this text, be sure not to include ASCII spaces and be sure
// to include punctuation using code points actually used for punctuation in
// Chinese.
// TODO: Add:
// * Japanese with realistic proportion of kana voicing marks
// * Korean, since Hangul is special-cased in the normalizer
// * Kannada or some other non-Korean BMP language that uses
// backward-combining starters (with realistic proportion of such
// characters).
// * Chakma or some other living non-BMP language.
// * Vietnamese in the orthographic form (i.e. as produced by
// the official non-IME keyboard layout that's less common
// than the NFC-producing IME.)
/// Criterion entry point: benchmarks every language sample in turn.
///
/// The file-backed samples (`EL`, `FR`, `VI`) are passed through
/// `prepare_file_contents` first; the inline `EN` and `ZH` samples are used
/// verbatim.
pub fn criterion_benchmark(c: &mut Criterion) {
    let greek = prepare_file_contents(EL);
    bench_lang("el", greek.as_str(), c);

    bench_lang("en", EN, c);

    let french = prepare_file_contents(FR);
    bench_lang("fr", french.as_str(), c);

    let vietnamese = prepare_file_contents(VI);
    bench_lang("vi", vietnamese.as_str(), c);

    bench_lang("zh", ZH, c);
}

3097
vendor/icu_normalizer/src/lib.rs vendored Normal file

File diff suppressed because it is too large Load Diff

663
vendor/icu_normalizer/src/properties.rs vendored Normal file
View File

@@ -0,0 +1,663 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Access to the Unicode properties or property-based operations that
//! are required for NFC and NFD.
//!
//! Applications should generally use the full normalizers that are
//! provided at the top level of this crate. However, the APIs in this
//! module are provided for callers such as HarfBuzz that specifically
//! want access to the raw canonical composition operation e.g. for use in a
//! glyph-availability-guided custom normalizer.
use crate::char_from_u16;
use crate::char_from_u32;
use crate::in_inclusive_range;
use crate::provider::CanonicalCompositions;
use crate::provider::DecompositionData;
use crate::provider::DecompositionTables;
use crate::provider::NonRecursiveDecompositionSupplement;
use crate::provider::NormalizerNfcV1;
use crate::provider::NormalizerNfdDataV1;
use crate::provider::NormalizerNfdSupplementV1;
use crate::provider::NormalizerNfdTablesV1;
use crate::trie_value_has_ccc;
use crate::CanonicalCombiningClass;
use crate::BACKWARD_COMBINING_MARKER;
use crate::FDFA_MARKER;
use crate::HANGUL_L_BASE;
use crate::HANGUL_N_COUNT;
use crate::HANGUL_S_BASE;
use crate::HANGUL_S_COUNT;
use crate::HANGUL_T_BASE;
use crate::HANGUL_T_COUNT;
use crate::HANGUL_V_BASE;
use crate::HIGH_ZEROS_MASK;
use crate::LOW_ZEROS_MASK;
use crate::NON_ROUND_TRIP_MARKER;
use icu_provider::prelude::*;
/// Borrowed version of the raw canonical composition operation.
///
/// Callers should generally use `ComposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to the raw canonical composition operation e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug, Copy, Clone)]
pub struct CanonicalCompositionBorrowed<'a> {
/// The composition data; `compose` iterates over its
/// `canonical_compositions` trie.
canonical_compositions: &'a CanonicalCompositions<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCompositionBorrowed<'static> {
    /// Same as [`CanonicalCompositionBorrowed::new`]: uses compiled data.
    fn default() -> Self {
        CanonicalCompositionBorrowed::new()
    }
}
impl CanonicalCompositionBorrowed<'static> {
    /// Cheaply converts a [`CanonicalCompositionBorrowed<'static>`] into a [`CanonicalComposition`].
    ///
    /// Note: Due to branching and indirection, using [`CanonicalComposition`] might inhibit some
    /// compile-time optimizations that are possible with [`CanonicalCompositionBorrowed`].
    pub const fn static_to_owned(self) -> CanonicalComposition {
        // Only the `'static` reference is bound to a local; the payload is
        // built directly inside the returned struct.
        let data = self.canonical_compositions;
        CanonicalComposition {
            canonical_compositions: DataPayload::from_static_ref(data),
        }
    }

    /// Constructs a new `CanonicalCompositionBorrowed` using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        CanonicalCompositionBorrowed {
            canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
        }
    }
}
impl CanonicalCompositionBorrowed<'_> {
    /// Canonically composes `starter` with `second` (Hangul included).
    ///
    /// Returns `None` if the two characters don't compose. Composition
    /// exclusions are taken into account.
    ///
    /// # Examples
    ///
    /// ```
    /// let comp = icu::normalizer::properties::CanonicalCompositionBorrowed::new();
    ///
    /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters
    /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
    /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ'));
    /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion
    /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter
    /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV
    /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT
    /// ```
    #[inline(always)]
    pub fn compose(self, starter: char, second: char) -> Option<char> {
        let trie_iter = self.canonical_compositions.canonical_compositions.iter();
        crate::compose(trie_iter, starter, second)
    }
}
/// The raw canonical composition operation.
///
/// Callers should generally use `ComposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to the raw canonical composition operation e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalComposition {
/// Owned payload of the composition data; `as_borrowed` hands out a
/// reference to its contents.
canonical_compositions: DataPayload<NormalizerNfcV1>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalComposition {
    /// Builds the owned type from the compiled-data borrowed instance.
    fn default() -> Self {
        CanonicalCompositionBorrowed::new().static_to_owned()
    }
}
impl CanonicalComposition {
    /// Returns the borrowed counterpart of this type, which is the one that
    /// is more efficient to query.
    pub fn as_borrowed(&self) -> CanonicalCompositionBorrowed<'_> {
        let canonical_compositions = self.canonical_compositions.get();
        CanonicalCompositionBorrowed {
            canonical_compositions,
        }
    }

    /// Constructs a new `CanonicalCompositionBorrowed` using compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub const fn new() -> CanonicalCompositionBorrowed<'static> {
        CanonicalCompositionBorrowed::new()
    }

    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
        functions: [
            new: skip,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]
    );

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfcV1> + ?Sized,
    {
        // A load failure is passed straight back to the caller.
        provider
            .load(Default::default())
            .map(|response| Self {
                canonical_compositions: response.payload,
            })
    }
}
/// The outcome of non-recursive canonical decomposition of a character.
///
/// This is the return type of `CanonicalDecompositionBorrowed::decompose`.
#[allow(clippy::exhaustive_enums)]
#[derive(Debug, PartialEq, Eq)]
pub enum Decomposed {
/// The character is its own canonical decomposition (e.g. `'e'`).
Default,
/// The character decomposes to a single different character
/// (e.g. U+212B ANGSTROM SIGN decomposes to `'Å'`).
Singleton(char),
/// The character decomposes to two characters
/// (e.g. `'ệ'` decomposes to `'ẹ'` followed by U+0302).
Expansion(char, char),
}
/// Borrowed version of the raw (non-recursive) canonical decomposition operation.
///
/// Callers should generally use `DecomposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to non-recursive canonical decomposition e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalDecompositionBorrowed<'a> {
/// Main decomposition data; its trie is the first thing consulted for
/// every non-Hangul character.
decompositions: &'a DecompositionData<'a>,
/// Expansion tables (`scalars16` / `scalars24`) that are indexed with an
/// offset taken from the main trie value.
tables: &'a DecompositionTables<'a>,
/// Supplementary data consulted when the main trie value cannot be turned
/// into a non-recursive decomposition directly.
non_recursive: &'a NonRecursiveDecompositionSupplement<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecompositionBorrowed<'static> {
    /// Same as [`CanonicalDecompositionBorrowed::new`]: uses compiled data.
    fn default() -> Self {
        CanonicalDecompositionBorrowed::new()
    }
}
impl CanonicalDecompositionBorrowed<'static> {
    /// Cheaply converts a [`CanonicalDecompositionBorrowed<'static>`] into a [`CanonicalDecomposition`].
    ///
    /// Note: Due to branching and indirection, using [`CanonicalDecomposition`] might inhibit some
    /// compile-time optimizations that are possible with [`CanonicalDecompositionBorrowed`].
    pub const fn static_to_owned(self) -> CanonicalDecomposition {
        // Only the `'static` references are bound to locals; the payloads are
        // built directly inside the returned struct.
        let data = self.decompositions;
        let tables = self.tables;
        let supplement = self.non_recursive;
        CanonicalDecomposition {
            decompositions: DataPayload::from_static_ref(data),
            tables: DataPayload::from_static_ref(tables),
            non_recursive: DataPayload::from_static_ref(supplement),
        }
    }

    /// Construct from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        use crate::provider::Baked;

        // Compile-time guard: the combined length of the two expansion
        // tables of the compiled data must not exceed 0xFFF.
        const _: () = {
            let tables = Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
            assert!(
                tables.scalars16.const_len() + tables.scalars24.const_len() <= 0xFFF,
                "future extension"
            );
        };

        CanonicalDecompositionBorrowed {
            decompositions: Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
            tables: Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
            non_recursive: Baked::SINGLETON_NORMALIZER_NFD_SUPPLEMENT_V1,
        }
    }
}
impl CanonicalDecompositionBorrowed<'_> {
/// Performs non-recursive canonical decomposition (including for Hangul).
///
/// Hangul syllables are decomposed arithmetically; every other character
/// is handed to the data-driven lookup.
///
/// ```
/// use icu::normalizer::properties::Decomposed;
/// let decomp = icu::normalizer::properties::CanonicalDecompositionBorrowed::new();
///
/// assert_eq!(decomp.decompose('e'), Decomposed::Default);
/// assert_eq!(
/// decomp.decompose('ệ'),
/// Decomposed::Expansion('ẹ', '\u{0302}')
/// );
/// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));
/// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
/// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN
/// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
/// ```
#[inline]
pub fn decompose(&self, c: char) -> Decomposed {
// Offset of `c` from `HANGUL_S_BASE`. `wrapping_sub` makes characters below
// the base wrap around to a huge value, so the single comparison below
// rejects characters on either side of the syllable range.
let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE);
if lvt >= HANGUL_S_COUNT {
return self.decompose_non_hangul(c);
}
// Invariant: lvt < HANGUL_S_COUNT
// NOTE(review): the concrete figures in the comments below assume that the
// `HANGUL_*` constants defined at the crate root carry the Unicode Standard
// values (SBase = 0xAC00, SCount = 11172, NCount = 588, TCount = 28,
// LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7) — confirm against lib.rs.
let t = lvt % HANGUL_T_COUNT;
// Invariant: t < HANGUL_T_COUNT (= 28)
if t == 0 {
// No trailing consonant: split the LV syllable into L and V.
let l = lvt / HANGUL_N_COUNT;
// Invariant: l < (HANGUL_S_COUNT / HANGUL_N_COUNT = 11172 / 588 = 19)
let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT;
// Invariant: v < (HANGUL_N_COUNT / HANGUL_T_COUNT = 588 / 28 = 21)
return Decomposed::Expansion(
// Safety: HANGUL_L_BASE and HANGUL_V_BASE are 0x11nn; adding l < 19 and
// v < 21 respectively keeps both results in range, far below 0xD800
unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) },
unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) },
);
}
// Trailing consonant present: split the LVT syllable into LV and T.
let lv = lvt - t;
// Invariant: lv < lvt < HANGUL_S_COUNT and 0 < t < HANGUL_T_COUNT
// Safe because values known to be in range
Decomposed::Expansion(
// Safety: HANGUL_S_BASE + lv is at most HANGUL_S_BASE + HANGUL_S_COUNT - 1
// (0xD7A3), and HANGUL_T_BASE (0x11nn) + t with t < 28 stays below 0x1200;
// both are less than 0xD800
unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) },
unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) },
)
}
/// Performs non-recursive canonical decomposition except Hangul syllables
/// are reported as `Decomposed::Default`.
#[inline(always)]
fn decompose_non_hangul(&self, c: char) -> Decomposed {
let decomposition = self.decompositions.trie.get(c);
// The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
// and that flag needs to be ignored here.
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
return Decomposed::Default;
}
// The loop is only broken out of as goto forward: every `break` jumps to
// the supplementary-trie lookup that follows the loop.
#[expect(clippy::never_loop)]
loop {
let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
if !high_zeros && !low_zeros {
// Decomposition into two BMP characters: starter and non-starter
if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') {
// Look in the other trie due to oxia singleton
// mappings to corresponding character with tonos.
break;
}
// The starter is in the low 15 bits and the combining character in
// the next 15 bits.
let starter = char_from_u32(decomposition & 0x7FFF);
let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
return Decomposed::Expansion(starter, combining);
}
if high_zeros {
// Decomposition into one BMP character or non-starter
if trie_value_has_ccc(decomposition) {
// Non-starter
if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') {
return Decomposed::Default;
}
// Non-starters that have a decomposition are special-cased here.
return match c {
'\u{0340}' => {
// COMBINING GRAVE TONE MARK
Decomposed::Singleton('\u{0300}')
}
'\u{0341}' => {
// COMBINING ACUTE TONE MARK
Decomposed::Singleton('\u{0301}')
}
'\u{0343}' => {
// COMBINING GREEK KORONIS
Decomposed::Singleton('\u{0313}')
}
'\u{0344}' => {
// COMBINING GREEK DIALYTIKA TONOS
Decomposed::Expansion('\u{0308}', '\u{0301}')
}
'\u{0F73}' => {
// TIBETAN VOWEL SIGN II
Decomposed::Expansion('\u{0F71}', '\u{0F72}')
}
'\u{0F75}' => {
// TIBETAN VOWEL SIGN UU
Decomposed::Expansion('\u{0F71}', '\u{0F74}')
}
'\u{0F81}' => {
// TIBETAN VOWEL SIGN REVERSED II
Decomposed::Expansion('\u{0F71}', '\u{0F80}')
}
_ => Decomposed::Default,
};
}
let singleton = decomposition as u16;
debug_assert_ne!(
singleton, FDFA_MARKER,
"How come we got the U+FDFA NFKD marker here?"
);
return Decomposed::Singleton(char_from_u16(singleton));
}
if c == '\u{212B}' {
// ANGSTROM SIGN
return Decomposed::Singleton('\u{00C5}');
}
// The remaining trie values refer to the expansion tables: the offset
// (stored plus one) sits above bit 16 and the length bits in the low nibble.
// Only 12 of 14 bits used as of Unicode 16.
let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
// Only 3 of 4 bits used as of Unicode 16.
let len_bits = decomposition & 0b1111;
let tables = self.tables;
if offset < tables.scalars16.len() {
if len_bits != 0 {
// i.e. logical len isn't 2
break;
}
if let Some(first) = tables.scalars16.get(offset) {
if let Some(second) = tables.scalars16.get(offset + 1) {
// Two BMP starters
return Decomposed::Expansion(char_from_u16(first), char_from_u16(second));
}
}
// GIGO case
debug_assert!(false);
return Decomposed::Default;
}
let len = len_bits + 1;
if len > 2 {
break;
}
// Offsets at or past the end of `scalars16` index `scalars24` once the
// length of `scalars16` has been subtracted.
let offset24 = offset - tables.scalars16.len();
if let Some(first_c) = tables.scalars24.get(offset24) {
if len == 1 {
return Decomposed::Singleton(first_c);
}
if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
return Decomposed::Expansion(first_c, second_c);
}
}
// GIGO case
debug_assert!(false);
return Decomposed::Default;
}
// Reached only via `break`: the main trie value could not be used directly,
// so consult the supplementary non-recursive data.
let non_recursive = self.non_recursive;
let non_recursive_decomposition = non_recursive.trie.get(c);
if non_recursive_decomposition == 0 {
// GIGO case
debug_assert!(false);
return Decomposed::Default;
}
let trail_or_complex = (non_recursive_decomposition >> 16) as u16;
let lead = non_recursive_decomposition as u16;
if lead != 0 && trail_or_complex != 0 {
// Decomposition into two BMP characters
return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
}
if lead != 0 {
// Decomposition into one BMP character
return Decomposed::Singleton(char_from_u16(lead));
}
// Decomposition into two non-BMP characters
// The high 16 bits are an offset into a table plus one to keep it non-zero.
let offset = usize::from(trail_or_complex - 1);
if let Some(first) = non_recursive.scalars24.get(offset) {
if let Some(second) = non_recursive.scalars24.get(offset + 1) {
return Decomposed::Expansion(first, second);
}
}
// GIGO case
debug_assert!(false);
Decomposed::Default
}
}
/// The raw (non-recursive) canonical decomposition operation.
///
/// Callers should generally use `DecomposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to non-recursive canonical decomposition e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalDecomposition {
/// Owned payload of the main decomposition data.
decompositions: DataPayload<NormalizerNfdDataV1>,
/// Owned payload of the expansion tables; `try_new_unstable` rejects
/// tables whose combined length exceeds 0xFFF.
tables: DataPayload<NormalizerNfdTablesV1>,
/// Owned payload of the supplementary non-recursive decomposition data.
non_recursive: DataPayload<NormalizerNfdSupplementV1>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecomposition {
    /// Builds the owned type from the compiled-data borrowed instance.
    fn default() -> Self {
        CanonicalDecompositionBorrowed::new().static_to_owned()
    }
}
impl CanonicalDecomposition {
    /// Returns the borrowed counterpart of this type, which is the one that
    /// is more efficient to query.
    pub fn as_borrowed(&self) -> CanonicalDecompositionBorrowed<'_> {
        let decompositions = self.decompositions.get();
        let tables = self.tables.get();
        let non_recursive = self.non_recursive.get();
        CanonicalDecompositionBorrowed {
            decompositions,
            tables,
            non_recursive,
        }
    }

    /// Construct from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub const fn new() -> CanonicalDecompositionBorrowed<'static> {
        CanonicalDecompositionBorrowed::new()
    }

    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
        functions: [
            new: skip,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
        ]
    );

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfdDataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfdSupplementV1>
            + ?Sized,
    {
        let decompositions: DataPayload<NormalizerNfdDataV1> =
            provider.load(Default::default())?.payload;
        let tables: DataPayload<NormalizerNfdTablesV1> =
            provider.load(Default::default())?.payload;

        let combined_table_len = {
            let loaded = tables.get();
            loaded.scalars16.len() + loaded.scalars24.len()
        };
        if combined_table_len > 0xFFF {
            // The data is from a future where there exists a normalization flavor whose
            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
            // of space. If a good use case for such a decomposition flavor arises, the bit
            // masks can be changed dynamically so that the length mask becomes 0x1FFF
            // instead of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000.
            // Since the masks are hard-coded for now, error out.
            return Err(DataError::custom("future extension"));
        }

        // The supplement is loaded only after the tables passed the size check.
        let non_recursive: DataPayload<NormalizerNfdSupplementV1> =
            provider.load(Default::default())?.payload;

        Ok(Self {
            decompositions,
            tables,
            non_recursive,
        })
    }
}
/// Borrowed version of lookup of the Canonical_Combining_Class Unicode property.
///
/// # Example
///
/// ```
/// use icu::properties::props::CanonicalCombiningClass;
/// use icu::normalizer::properties::CanonicalCombiningClassMapBorrowed;
///
/// let map = CanonicalCombiningClassMapBorrowed::new();
/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
/// ```
#[derive(Debug)]
pub struct CanonicalCombiningClassMapBorrowed<'a> {
/// The data trie: the decomposition data, from whose trie values
/// `get32_u8` extracts the canonical combining class.
decompositions: &'a DecompositionData<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMapBorrowed<'static> {
    /// Same as [`CanonicalCombiningClassMapBorrowed::new`]: uses compiled data.
    fn default() -> Self {
        CanonicalCombiningClassMapBorrowed::new()
    }
}
impl CanonicalCombiningClassMapBorrowed<'static> {
    /// Cheaply converts a [`CanonicalCombiningClassMapBorrowed<'static>`] into a [`CanonicalCombiningClassMap`].
    ///
    /// Note: Due to branching and indirection, using [`CanonicalCombiningClassMap`] might inhibit some
    /// compile-time optimizations that are possible with [`CanonicalCombiningClassMapBorrowed`].
    pub const fn static_to_owned(self) -> CanonicalCombiningClassMap {
        // Only the `'static` reference is bound to a local; the payload is
        // built directly inside the returned struct.
        let data = self.decompositions;
        CanonicalCombiningClassMap {
            decompositions: DataPayload::from_static_ref(data),
        }
    }

    /// Construct from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self {
            decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
        }
    }
}
impl CanonicalCombiningClassMapBorrowed<'_> {
    /// Look up the canonical combining class for a scalar value.
    ///
    /// The return value is a u8 representing the canonical combining class,
    /// you may enable the `"icu_properties"` feature if you would like to use a typed
    /// `CanonicalCombiningClass`.
    #[inline(always)]
    pub fn get_u8(&self, c: char) -> u8 {
        let scalar = u32::from(c);
        self.get32_u8(scalar)
    }

    /// Look up the canonical combining class for a scalar value
    /// represented as `u32`. If the argument is outside the scalar
    /// value range, `Not_Reordered` is returned.
    ///
    /// The return value is a u8 representing the canonical combining class,
    /// you may enable the `"icu_properties"` feature if you would like to use a typed
    /// `CanonicalCombiningClass`.
    pub fn get32_u8(&self, c: u32) -> u8 {
        let trie_value = self.decompositions.trie.get32(c);
        if !trie_value_has_ccc(trie_value) {
            // The trie value carries no combining class.
            return ccc!(NotReordered, 0).to_icu4c_value();
        }
        // The class is the low byte of the trie value.
        trie_value as u8
    }

    /// Look up the canonical combining class for a scalar value
    ///
    /// ✨ *Enabled with the `icu_properties` Cargo feature.*
    #[inline(always)]
    #[cfg(feature = "icu_properties")]
    pub fn get(&self, c: char) -> CanonicalCombiningClass {
        let raw = self.get_u8(c);
        CanonicalCombiningClass::from_icu4c_value(raw)
    }

    /// Look up the canonical combining class for a scalar value
    /// represented as `u32`. If the argument is outside the scalar
    /// value range, `CanonicalCombiningClass::NotReordered` is returned.
    ///
    /// ✨ *Enabled with the `icu_properties` Cargo feature.*
    #[cfg(feature = "icu_properties")]
    pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
        let raw = self.get32_u8(c);
        CanonicalCombiningClass::from_icu4c_value(raw)
    }
}
/// Lookup of the Canonical_Combining_Class Unicode property.
///
/// This is the owned form; call `as_borrowed` to obtain the
/// `CanonicalCombiningClassMapBorrowed` that performs the lookups.
#[derive(Debug)]
pub struct CanonicalCombiningClassMap {
/// The data trie (owned payload of the decomposition data)
decompositions: DataPayload<NormalizerNfdDataV1>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMap {
    /// Builds the owned type from the compiled-data borrowed instance.
    fn default() -> Self {
        CanonicalCombiningClassMapBorrowed::new().static_to_owned()
    }
}
impl CanonicalCombiningClassMap {
    /// Returns the borrowed counterpart of this type, which is the one that
    /// is more efficient to query.
    pub fn as_borrowed(&self) -> CanonicalCombiningClassMapBorrowed<'_> {
        let decompositions = self.decompositions.get();
        CanonicalCombiningClassMapBorrowed { decompositions }
    }

    /// Construct from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub const fn new() -> CanonicalCombiningClassMapBorrowed<'static> {
        CanonicalCombiningClassMapBorrowed::new()
    }

    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
        functions: [
            new: skip,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
    ]);

    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerNfdDataV1> + ?Sized,
    {
        // A load failure is passed straight back to the caller.
        provider
            .load(Default::default())
            .map(|response| Self {
                decompositions: response.payload,
            })
    }
}

216
vendor/icu_normalizer/src/provider.rs vendored Normal file
View File

@@ -0,0 +1,216 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]
// Provider structs must be stable
#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
use icu_collections::char16trie::Char16Trie;
use icu_collections::codepointtrie::CodePointTrie;
use icu_provider::prelude::*;
use zerovec::ZeroVec;
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
/// Baked data
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
// NOTE(review): the `DataProvider` implementations mentioned above are not
// written out by hand; they presumably come from the `make_provider!` and
// `impl_normalizer_*!` invocations on `Baked` in the `const _` block that
// follows — confirm against `icu_normalizer_data`.
pub struct Baked;
// Anonymous const item used purely as a private scope for the glob import,
// the helper module and the macro invocations that target `Baked`.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
use icu_normalizer_data::*;
// Re-exports this crate as `icu::normalizer` and `icu_collections` as
// `icu::collections`. Presumably the macro-generated code names its types
// through these `icu::…` paths — TODO confirm against the macro definitions.
pub mod icu {
pub use crate as normalizer;
pub use icu_collections as collections;
}
// One invocation per data marker declared in this module.
make_provider!(Baked);
impl_normalizer_nfc_v1!(Baked);
impl_normalizer_nfd_data_v1!(Baked);
impl_normalizer_nfd_supplement_v1!(Baked);
impl_normalizer_nfd_tables_v1!(Baked);
impl_normalizer_nfkd_data_v1!(Baked);
impl_normalizer_nfkd_tables_v1!(Baked);
impl_normalizer_uts46_data_v1!(Baked);
};
// Data markers of this component. Each invocation names the marker type, its
// path string and the data struct it carries; all of them are singletons.
icu_provider::data_marker!(
/// Marker for data for canonical decomposition.
NormalizerNfdDataV1,
"normalizer/nfd/data/v1",
DecompositionData<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Marker for additional data for canonical decomposition.
NormalizerNfdTablesV1,
"normalizer/nfd/tables/v1",
DecompositionTables<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Marker for data for compatibility decomposition.
NormalizerNfkdDataV1,
"normalizer/nfkd/data/v1",
DecompositionData<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Marker for additional data for compatibility decomposition.
NormalizerNfkdTablesV1,
"normalizer/nfkd/tables/v1",
DecompositionTables<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Marker for data for UTS-46 decomposition.
NormalizerUts46DataV1,
"normalizer/uts46/data/v1",
DecompositionData<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Marker for data for composition.
NormalizerNfcV1,
"normalizer/nfc/v1",
CanonicalCompositions<'static>,
is_singleton = true
);
icu_provider::data_marker!(
/// Marker for additional data for non-recursive decomposition.
NormalizerNfdSupplementV1,
"normalizer/nfd/supplement/v1",
NonRecursiveDecompositionSupplement<'static>,
is_singleton = true
);
#[cfg(feature = "datagen")]
/// The latest minimum set of markers required by this component.
///
/// Lists every marker declared with `data_marker!` in this module.
pub const MARKERS: &[DataMarkerInfo] = &[
NormalizerNfcV1::INFO,
NormalizerNfdDataV1::INFO,
NormalizerNfdTablesV1::INFO,
NormalizerNfkdDataV1::INFO,
NormalizerNfkdTablesV1::INFO,
NormalizerNfdSupplementV1::INFO,
NormalizerUts46DataV1::INFO,
];
/// Decomposition data
///
/// This struct is the payload of the NFD, NFKD and UTS 46 data markers
/// declared in this module.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionData<'data> {
/// Trie for decomposition.
///
/// The `u32` values are bit-packed; the `properties` module reads both
/// decomposition information and the canonical combining class from them.
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, u32>,
/// The passthrough bounds of NFD/NFC are lowered to this
/// maximum instead. (16-bit, because cannot be higher
/// than 0x0300, which is the bound for NFC.)
pub passthrough_cap: u16,
}
icu_provider::data_struct!(
DecompositionData<'_>,
#[cfg(feature = "datagen")]
);
/// The expansion tables for cases where the decomposition isn't
/// contained in the trie value
///
/// `CanonicalDecompositionBorrowed::decompose_non_hangul` treats the two
/// vectors as one logical index space: offsets below `scalars16.len()` index
/// `scalars16`, and larger offsets index `scalars24` after `scalars16.len()`
/// has been subtracted. `CanonicalDecomposition::try_new_unstable` rejects
/// data whose combined length exceeds 0xFFF.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionTables<'data> {
/// Decompositions that are fully within the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars16: ZeroVec<'data, u16>,
/// Decompositions with at least one character outside
/// the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars24: ZeroVec<'data, char>,
}
icu_provider::data_struct!(
DecompositionTables<'_>,
#[cfg(feature = "datagen")]
);
/// Non-Hangul canonical compositions
///
/// This struct is the payload of the `NormalizerNfcV1` marker.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CanonicalCompositions<'data> {
/// Trie keys are two-`char` strings with the second
/// character coming first. The value, if any, is the
/// (non-Hangul) canonical composition.
#[cfg_attr(feature = "serde", serde(borrow))]
pub canonical_compositions: Char16Trie<'data>,
}
icu_provider::data_struct!(
CanonicalCompositions<'_>,
#[cfg(feature = "datagen")]
);
/// Non-recursive canonical decompositions that differ from
/// `DecompositionData`.
///
/// This struct is the payload of the `NormalizerNfdSupplementV1` marker.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct NonRecursiveDecompositionSupplement<'data> {
/// Trie for the supplementary non-recursive decompositions
///
/// As read by the `properties` module: a value of 0 means no entry.
/// Otherwise the low 16 bits hold a lead BMP character, and the high
/// 16 bits hold either a trail BMP character or, when the low half is
/// zero, a one-based offset into `scalars24`.
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, u32>,
/// Decompositions with at least one character outside
/// the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars24: ZeroVec<'data, char>,
}
icu_provider::data_struct!(
NonRecursiveDecompositionSupplement<'_>,
#[cfg(feature = "datagen")]
);

177
vendor/icu_normalizer/src/uts46.rs vendored Normal file
View File

@@ -0,0 +1,177 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Bundles the part of UTS 46 that makes sense to implement as a
//! normalization.
//!
//! This is meant to be used as a building block of an UTS 46
//! implementation, such as the `idna` crate.
use crate::ComposingNormalizer;
use crate::ComposingNormalizerBorrowed;
use crate::NormalizerNfcV1;
use crate::NormalizerNfdTablesV1;
use crate::NormalizerNfkdTablesV1;
use crate::NormalizerUts46DataV1;
use icu_provider::DataError;
use icu_provider::DataProvider;
// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
// having a `Uts46Mapper` serves two purposes:
//
// 1. Denying public access to parts of the `ComposingNormalizer` API
// that don't work when the data contains markers for ignorables.
// 2. Providing a place where additional iterator pre-processing or
// post-processing can take place if needed in the future. (When
// writing this, it looked like such processing was needed but
// now isn't needed after all.)
/// A borrowed version of a mapper that knows how to perform the
/// subsets of UTS 46 processing documented on the methods.
///
/// This is a thin wrapper around a [`ComposingNormalizerBorrowed`].
#[derive(Debug)]
pub struct Uts46MapperBorrowed<'a> {
// Kept private so that callers cannot reach the wrapped normalizer's
// API directly; only the methods of this type are exposed.
normalizer: ComposingNormalizerBorrowed<'a>,
}
#[cfg(feature = "compiled_data")]
impl Default for Uts46MapperBorrowed<'static> {
    /// Same as [`Uts46MapperBorrowed::new`]: a mapper backed by compiled data.
    fn default() -> Self {
        Uts46MapperBorrowed::new()
    }
}
impl Uts46MapperBorrowed<'static> {
    /// Construct with compiled data.
    ///
    /// The wrapped normalizer comes from
    /// `ComposingNormalizerBorrowed::new_uts46()`.
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Self {
            normalizer: ComposingNormalizerBorrowed::new_uts46(),
        }
    }

    /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`].
    ///
    /// The conversion consumes `self` and delegates to the wrapped
    /// normalizer's own `static_to_owned`.
    ///
    /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some
    /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`].
    pub const fn static_to_owned(self) -> Uts46Mapper {
        Uts46Mapper {
            normalizer: self.normalizer.static_to_owned(),
        }
    }
}
impl Uts46MapperBorrowed<'_> {
    /// Wraps an `Iterator` over `char` in an iterator adaptor. The adaptor
    /// yields a `char` sequence to which the following operations from the
    /// "Map" and "Normalize" steps of the "Processing" section of UTS 46
    /// have been lazily applied:
    ///
    /// 1. The _ignored_ characters are ignored.
    /// 2. The _mapped_ characters are mapped.
    /// 3. The _disallowed_ characters are replaced with U+FFFD,
    ///    which itself is a disallowed character.
    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
    ///    as appropriate.
    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
    /// 6. The _disallowed_STD3_mapped_ characters are treated as
    ///    _mapped_.
    /// 7. The result is normalized to NFC.
    ///
    /// Notably:
    ///
    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
    ///   post-processing step.
    /// * Transitional processing is not performed. Transitional mapping
    ///   would be a pre-processing step, but transitional processing is
    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
    pub fn map_normalize<'delegate, I>(
        &'delegate self,
        iter: I,
    ) -> impl Iterator<Item = char> + 'delegate
    where
        I: Iterator<Item = char> + 'delegate,
    {
        // Mapping mode: ignorable characters are handled as "ignored".
        let ignorables = crate::IgnorableBehavior::Ignored;
        self.normalizer.normalize_iter_private(iter, ignorables)
    }

    /// Wraps an `Iterator` over `char` in an iterator adaptor. The adaptor
    /// yields a `char` sequence to which the following operations from the
    /// NFC check and status steps of the "Validity Criteria" section of
    /// UTS 46 have been lazily applied:
    ///
    /// 1. The _ignored_ characters are treated as _disallowed_.
    /// 2. The _mapped_ characters are mapped.
    /// 3. The _disallowed_ characters are replaced with U+FFFD,
    ///    which itself is a disallowed character.
    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
    ///    as appropriate.
    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
    /// 6. The _disallowed_STD3_mapped_ characters are treated as
    ///    _mapped_.
    /// 7. The result is normalized to NFC.
    ///
    /// Notably:
    ///
    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
    ///   post-processing step.
    /// * Transitional processing is not performed. Transitional mapping
    ///   would be a pre-processing step, but transitional processing is
    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
    /// * The caller needs to compare the output with the input to see
    ///   whether anything changed. That comparison is what catches failures
    ///   to adhere to the normalization and status requirements. In
    ///   particular, it makes _mapped_ characters result in an error, as
    ///   "Validity Criteria" requires.
    pub fn normalize_validate<'delegate, I>(
        &'delegate self,
        iter: I,
    ) -> impl Iterator<Item = char> + 'delegate
    where
        I: Iterator<Item = char> + 'delegate,
    {
        // Validation mode: ignorable characters become the replacement character.
        let ignorables = crate::IgnorableBehavior::ReplacementCharacter;
        self.normalizer.normalize_iter_private(iter, ignorables)
    }
}
/// A mapper that knows how to perform the subsets of UTS 46 processing
/// documented on the methods.
///
/// This is a thin wrapper around a [`ComposingNormalizer`].
#[derive(Debug)]
pub struct Uts46Mapper {
// Kept private so that callers cannot reach the wrapped normalizer's
// API directly; only the methods of this type are exposed.
normalizer: ComposingNormalizer,
}
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Builds the compiled-data borrowed mapper and converts it into the
    /// owned type.
    fn default() -> Self {
        Uts46MapperBorrowed::new().static_to_owned()
    }
}
impl Uts46Mapper {
    /// Constructs a borrowed version of this type for more efficient querying.
    pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> {
        let normalizer = self.normalizer.as_borrowed();
        Uts46MapperBorrowed { normalizer }
    }

    /// Construct with compiled data.
    ///
    /// Note that this returns the borrowed type
    /// ([`Uts46MapperBorrowed<'static>`]) rather than `Self`, which is why
    /// the `new_ret_no_self` lint is expected here.
    #[cfg(feature = "compiled_data")]
    #[expect(clippy::new_ret_no_self)]
    pub const fn new() -> Uts46MapperBorrowed<'static> {
        Uts46MapperBorrowed::new()
    }

    /// Construct with provider. Any [`DataError`] raised while building the
    /// underlying normalizer is returned to the caller.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<NormalizerUts46DataV1>
            + DataProvider<NormalizerNfdTablesV1>
            + DataProvider<NormalizerNfkdTablesV1>
            // UTS 46 tables merged into NormalizerNfkdTablesV1
            + DataProvider<NormalizerNfcV1>
            + ?Sized,
    {
        Ok(Self {
            normalizer: ComposingNormalizer::try_new_uts46_unstable(provider)?,
        })
    }
}

View File

@@ -0,0 +1,4 @@
# This is a placeholder in the interest of keeping the repository size smaller.
# Replace this file with the contents of
# https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually
# run the conformance test.

View File

@@ -0,0 +1,2 @@
The test data comes from
https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt

2083
vendor/icu_normalizer/tests/tests.rs vendored Normal file

File diff suppressed because it is too large Load Diff