chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1 @@
{"files":{".cargo_vcs_info.json":"af46ddb44f387baf3e14a4e0f2d025ea7da570f030d16f8c728319f194ea47be",".cirrus.yml":"7971c090d58cdc63ffa3db104772a221b443313a3588c7e6e1b8fbfc5dc9dcf5","AUTHORS.md":"f2cf336738ad935a482a799be004083ddd07c904513caf80f9e48011888fe1b6","Cargo.lock":"0456988b6573b06b7ebfbcad4457400a51b0fc5b6be72aa22440ff7f54af8314","Cargo.toml":"9954655fa5288b59d81cc2f66c2d20e415b808179c6434de1d9273a67150f3cc","Cargo.toml.orig":"38de48b85fa16a4ee9239e96ffc0447edd851b15a9b12b5aa923261a6f8fb565","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"da23be69ad3ccf7a5823d62152efcd2b15de9482a5014fcb1b02844662d86abd","README.md":"71c99222cb1e62b8f2c2e084caf6405150b57c8bf496c59b1636d63aa2e097d4","RELEASES.md":"2fde9ac9a45170936dcf66085fe7aa362d8536b1390a5f07f59e4f744fbf0d3d","benches/length.rs":"fb509f67ef24377f31c77d0e96ea560c5139307702dfe8cff4d2369176d9ff94","benches/multiiterators.rs":"6589943a33a75d6a56aacf2cc75ef2bb23328a509a57c99589ec078ef77ac7c7","do.sh":"028ba8c5c8e5b4cdc5459fceb025a66c6ad67c1e93c7a41a84bb76c5019716c8","examples/length_distribution.rs":"325ffa59c712d20c257a320a2aa5cd5624a6eb3d67ddc81d6012440e511865ec","src/decoding_iterators.rs":"c36b508273325b8dc589fd28dbd8931ee7de58412e2d0eb8d2da6fcf80803260","src/errors.rs":"d4a71b225c5f8da8bef6673ae5a77064a45c74f57b52a2b1f67685eb0e67b716","src/lib.rs":"e0cf77b9324d619c500d78f406f3be6b7fd10a06fd183397361e3e2381c57c5d","src/traits.rs":"b0b4e6b02834e13d9e1b062f1df540571f25fd290b4bbcd09ec778d7582e5f8f","src/utf16_char.rs":"f1e21f9ba8785441c5afe5e7b78a480c3f6e9b7f5208d56f10ce1421e9309ef1","src/utf16_iterators.rs":"c9967e945d68c5381404451432b889124b431c3eeb7ef72ec64821dfc988094a","src/utf8_char.rs":"2e35014e790d0cabda4e1926724d8ebb83c88215e6a98ed3cb3d7c5ae74454ee","src/utf8_iterators.rs":"114e2fe000a0366251442ece23d842f82828d0b9df0c6f7caed17a51e1a92c25","tests/errs.rs":"f68f76f0b23e51c227db382cfbce38cc401be562079dca73da52fe12dda299a7","tests/exhaustive.rs":"166e397c
5d4d636266f33d9fd0fbe4ca9595783a44691ca51e48f54c6a751b76","tests/iterators.rs":"25b75d0cbc086e8605840f66a42b9a2178ee060fdca2b8692616bdfc9e3425b4","tests/oks.rs":"0e3aabfe3cba6105d28b26d7bf3205caa2a345203e424a7c192d53327d7857ac"},"package":"34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"}

View File

@@ -0,0 +1,6 @@
{
"git": {
"sha1": "b764bc1cec904d2198b052c5a564963a930a0bcb"
},
"path_in_vcs": ""
}

90
vendor/encode_unicode/.cirrus.yml vendored Normal file
View File

@@ -0,0 +1,90 @@
task:
name: stable
container:
image: rust
cpu: 1
memory: 1G
allow_failures: false
env:
RUST_BACKTRACE: 1
cargo_cache:
folder: $HOME/.cargo/registry
fingerprint_script: cat Cargo.lock 2> /dev/null || true
target_cache:
folder: target
fingerprint_script: cat Cargo.lock 2> /dev/null || true
setup_script:
- rustup component add clippy
info_script:
- rustc --version
check_script:
- cargo check --examples --tests --no-default-features
- cargo check --examples --tests --no-default-features --features std
- cargo check --examples --tests --no-default-features --features ascii
- cargo build --examples --tests --all-features
- cargo clippy --tests --examples --all-features
test_script:
- cargo test --all-features --no-fail-fast -- --test-threads=1
before_cache_script:
- rm -rf $HOME/.cargo/registry/index
task:
name: MSRV
container:
image: rust:1.56
cpu: 1
memory: 1G
allow_failures: false
env:
RUST_BACKTRACE: 1
cargo_cache:
folder: $HOME/.cargo/registry
fingerprint_script: cat Cargo.lock 2> /dev/null || true
target_cache:
folder: target
fingerprint_script: cat Cargo.lock 2> /dev/null || true
info_script:
- rustc --version
build_script:
# Lock to the specified minor versions of dependencies
# to test that they work with our MSRV.
# But that doesn't cover recursive dependencies,
# so avoid checking examples and tests because they build dev dependencies.
# Tests and examples don't need to work at MSRV anyway.
- sed -i 's/"^/"~/' Cargo.toml
- cargo check --no-default-features
- cargo check --no-default-features --features std
- cargo check --no-default-features --features ascii
- cargo check --all-features
before_cache_script:
- rm -rf $HOME/.cargo/registry/index
task:
name: nightly
container:
image: rustlang/rust:nightly
cpu: 1
memory: 1G
allow_failures: false
cargo_cache:
folder: $HOME/.cargo/registry
fingerprint_script: cat Cargo.lock 2> /dev/null || true
# rustc version is so likely to have changed that build artefacts are not worth caching
setup_script:
- cargo install cargo-fuzz
- rustup component add miri
info_script:
- rustc --version
check_script:
- cargo check --benches --no-default-features
- cargo check --benches --no-default-features --features std
- cargo check --benches --no-default-features --features ascii
- cargo build --benches --all-features
- cargo fuzz build
# fuzz supports feature selection,
# but --no-default-features doesn't seem to have any effect
test_script:
# the doc tests are fast and should cover a lot of code
- cargo miri test --all-features --doc -- --test-threads=1
before_cache_script:
- rm -rf $HOME/.cargo/registry/index

4
vendor/encode_unicode/AUTHORS.md vendored Normal file
View File

@@ -0,0 +1,4 @@
# The encode_unicode Developers
* Torbjørn Birch Moltu
* Aljoscha Meyer

368
vendor/encode_unicode/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,368 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ascii"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbf56136a5198c7b01a49e3afcbef6cf84597273d298f54432926024107b0109"
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "cc"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "core-foundation"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
[[package]]
name = "encode_unicode"
version = "1.0.0"
dependencies = [
"ascii",
"lazy_static",
"minreq",
]
[[package]]
name = "fastrand"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
dependencies = [
"instant",
]
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "instant"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
"cfg-if",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.127"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "505e71a4706fa491e9b1b55f51b95d4037d0821ee40131190475f692b35b009b"
[[package]]
name = "log"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
]
[[package]]
name = "minreq"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c785bc6027fd359756e538541c8624012ba3776d3d3fe123885643092ed4132"
dependencies = [
"log",
"native-tls",
]
[[package]]
name = "native-tls"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd7e2f3618557f980e0b17e8856252eee3c97fa12c54dff0ca290fb6266ca4a9"
dependencies = [
"lazy_static",
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "once_cell"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1"
[[package]]
name = "openssl"
version = "0.10.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "618febf65336490dfcf20b73f885f5651a0c89c64c2d4a8c3662585a70bf5bd0"
dependencies = [
"bitflags",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.75"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f"
dependencies = [
"autocfg",
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "pkg-config"
version = "0.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae"
[[package]]
name = "proc-macro2"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
dependencies = [
"proc-macro2",
]
[[package]]
name = "redox_syscall"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
]
[[package]]
name = "remove_dir_all"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
dependencies = [
"winapi",
]
[[package]]
name = "schannel"
version = "0.1.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2"
dependencies = [
"lazy_static",
"windows-sys",
]
[[package]]
name = "security-framework"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dc14f172faf8a0194a3aded622712b0de276821addc574fa54fc0a1167e10dc"
dependencies = [
"bitflags",
"core-foundation",
"core-foundation-sys",
"libc",
"security-framework-sys",
]
[[package]]
name = "security-framework-sys"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0160a13a177a45bfb43ce71c01580998474f556ad854dcbca936dd2841a5c556"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "syn"
version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
dependencies = [
"cfg-if",
"fastrand",
"libc",
"redox_syscall",
"remove_dir_all",
"winapi",
]
[[package]]
name = "unicode-ident"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2"
dependencies = [
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47"
[[package]]
name = "windows_i686_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6"
[[package]]
name = "windows_i686_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024"
[[package]]
name = "windows_x86_64_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1"
[[package]]
name = "windows_x86_64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680"

59
vendor/encode_unicode/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,59 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
name = "encode_unicode"
version = "1.0.0"
authors = ["Torbjørn Birch Moltu <t.b.moltu@lyse.net>"]
description = """
UTF-8 and UTF-16 character types, iterators and related methods for char, u8 and u16.
"""
documentation = "https://docs.rs/encode_unicode/"
readme = "README.md"
keywords = [
"unicode",
"UTF-8",
"UTF-16",
]
categories = [
"encoding",
"no-std",
]
license = "Apache-2.0 OR MIT"
repository = "https://github.com/tormol/encode_unicode"
resolver = "2"
[package.metadata.docs.rs]
features = ["ascii/std"]
[[bench]]
name = "length"
required-features = ["std"]
[dependencies.ascii]
version = "^1.0.0"
optional = true
default-features = false
[dev-dependencies.minreq]
version = "^2.6"
features = ["https-native"]
[features]
default = ["std"]
std = []
[target."cfg(unix)".dev-dependencies.lazy_static]
version = "^1.0"
[badges.maintenance]
status = "passively-maintained"

202
vendor/encode_unicode/LICENSE-APACHE vendored Normal file
View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

17
vendor/encode_unicode/LICENSE-MIT vendored Normal file
View File

@@ -0,0 +1,17 @@
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

71
vendor/encode_unicode/README.md vendored Normal file
View File

@@ -0,0 +1,71 @@
# encode_unicode
UTF-8 and UTF-16 character types, iterators and related methods for `char`, `u8` and `u16`.
[![crates.io page](https://img.shields.io/crates/v/encode_unicode.svg)](https://crates.io/crates/encode_unicode/) ![License: Apache-2 or MIT](https://img.shields.io/crates/l/encode_unicode.svg) [![Documentation on docs.rs](https://docs.rs/encode_unicode/badge.svg)](https://docs.rs/encode_unicode/) [![CI build status](https://api.cirrus-ci.com/github/tormol/encode_unicode.svg)](https://cirrus-ci.com/github/tormol/encode_unicode)
## Features
* **[`Utf8Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf8Char.html)**:
A `char` stored as UTF-8. Can be borrowed as a `str` or `u8` slice.
* **[`Utf16Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf16Char.html)**:
A `char` stored as UTF-16. Can be borrowed as an `u16` slice.
* [Conversion methods on `char`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.CharExt.html):
* to and from UTF-8 as `[u8; 4]` or slice.
* to and from UTF-16 as `(u16, Option<u16>)` or slice.
* [Iterator adapters](https://docs.rs/encode_unicode/latest/encode_unicode/trait.IterExt.html)
for converting between `u8`s and `Utf8Char`s or `u16`s and `Utf16Char`s.
* Optimized [slice-based decoding iterators](https://docs.rs/encode_unicode/latest/encode_unicode/trait.SliceExt.html).
* [Precise errors when decoding a char from UTF-8, UTF-16 or `u32` fails](http://docs.rs/encode_unicode/latest/encode_unicode/error/index.html).
* Utility methods on [`u8`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U8UtfExt.html)
and [`u16`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U16UtfExt.html).
## Minimum supported Rust version
The minimum supported Rust version for 1.0.\* releases is 1.56.
Later 1.y.0 releases might require newer Rust versions, but the three most
recent stable releases at the time of publishing will always be supported.
For example this means that if the current stable Rust version is 1.66 when
encode_unicode 1.1.0 is released, then encode_unicode 1.1.\* will
not require a newer Rust version than 1.63.
## Optional features
* `#![no_std]`-mode: There are a few differences:
* `Error` doesn't exist, but `description()` is made available as an inherent impl.
* `Extend`/`FromIterator`-implementations for `String`/`Vec<u8>`/`Vec<u16>` are missing.
* There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` doesn't implement `Read`.
This feature is enabled by setting `default-features=false` in `Cargo.toml`:
`encode_unicode = {version="0.3.4", default-features=false}`.
* Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate:
Convert `Utf8Char` and `Utf16Char` to and from [ascii::`AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html).
## License
Licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your option.
### Contribution
Unless you explicitly state otherwise, any contribution intentionally
submitted for inclusion in the work by you, as defined in the Apache-2.0
license, shall be dual licensed as above, without any additional terms or
conditions.
## Developing
`do.sh` can be used to check all feature combinations, test everything, show output from benchmarks in case setup fails, run fuzz tests for a while and lint everything (except fuzz tests).
It assumes [rustup](https://rustup.rs) is installed and that [`cargo +release`](https://rust-lang.github.io/rustup/concepts/index.html#how-rustup-works) works.
(It is named the way it is to autocomplete fully from the first character after `./`.)
## History
The original purpose of this crate was to provide standins for the then
unstable `encode_utf8()` and `encode_utf16()` methods on `char`.
The standins were removed in version 0.3 when Rust 1.15 stabilized the
`encode_` methods, but the other stuff I added, such as iterators like
those `encode_utf{8,16}()` returned for a while, might still be of use.

104
vendor/encode_unicode/RELEASES.md vendored Normal file
View File

@@ -0,0 +1,104 @@
Version 1.0.0 (2022-08-07)
==========================
* Replace error types `InvalidUtf8Array`, `InvalidUtf8Slice`, `InvalidUtf8FirstByte` and `InvalidUtf8` with `Utf8Error` plus `Utf8ErrorKind`.
Which of the new error kind variants is reported doesn't map 1:1 to the old enum variants:
For example `Utf8ErrorKind::NonUtf8Byte` is returned for sequences that would previously have been reported as too high codepoint or overlong encoding.
* Rename many other error types for consistency:
* `InvalidCodepoint` -> `CodepointError`
* `InvalidUtf16FirstUnit` -> `Utf16FirstUnitError`
* `InvalidUtf16Array` -> `Utf16ArrayError`
* `InvalidUtf16Slice` -> `Utf16SliceError`
* `InvalidUtf16Tuple` -> `Utf16TupleError`
* Change return type of `CodepointError::error_range()` to `RangeInclusive`.
* Rename some errors variants:
* `Utf16SliceError::FirstLowSurrogate` -> `FirstIsTrailingSurrogate`
* `Utf16SliceError::SecondNotLowSurrogate` -> `SecondIsNotTrailingSurrogate`
* `Utf16TupleError::InvalidSecond` -> `SecondIsNotTrailingSurrogate`
* Expose the error type of `Utf16Char::from_bmp()` and rename it to `NonBmpError`.
* Remove re-exports of `Utf8CharIterator` and `Utf16CharIterator` from the crate root.
(They are still exposed via the `iterator` module.)
* Remove impls of the deprecated `AsciiExt` trait,
and make the methods available in `#![no_std]`-mode.
* Make many of the previously `AsciiExt` methods take self by value.
* Drop support for pre-1.0 versions of the ascii crate.
* Remove `iter_bytes()` and `iter_units()`.
* Increase minimum Rust version to 1.56 and change the minimum Rust version policy.
* Fix possible UB or panic in `Utf8Char::from_slice_start_unchecked()` when passed an empty slice.
(relates to [#12](https://github.com/tormol/encode_unicode/issues/12).)
* Make many methods `const fn`.
* Add `const fn`s `Utf8Char::new()` and `Utf16Char::new()`.
Version 0.3.6 (2019-08-23)
==========================
* Fix pointless undefined behavior in `Utf16Char.to_ascii_char()` (which is part of ascii feature)
* Widen ascii version requirement to include 1.\*.
* Add `[u16; 2]` UTF-16 array alternatives to `(u16, Some(u16))` UTF-16 tuple methods.
* Add `Utf16Char.is_bmp()`.
Version 0.3.5 (2018-10-23)
==========================
* Fix docs.rs build failure
Version 0.3.4 (2018-10-23)
==========================
* Fix UB in UTF-8 validation which led to invalid codepoints being accepted in release mode.
* Add fallible decoding iterator adapters `Utf8CharMerger` and `Utf16CharMerger`
and slice-based iterators `Utf8CharDecoder` and `Utf16CharDecoder`
* Widen ascii version requirement from 0.8.\* to 0.8.0 - 0.10.\*
* Implement creating / extending `String`s from `Utf16Char`-producing iterators
Version 0.3.3 (2018-10-16)
==========================
* Fix UTF-8 overlong check. (`from_array()` and `from_slice()` accepted two-byte encodings of ASCII characters >= '@', which includes all letters)
* Implement `FromStr` for `Utf16Char`
* Add `from_str_start()` to `Utf8Char` and `Utf16Char`
* Add `Utf{8,16}Char{s,Indices}`: `str`-based iterators for `Utf8Char` and `Utf16Char` equivalent to `char`'s `Chars` and `CharIndices`.
* Add `StrExt` with functions to create the above iterators.
* Implement `FromIterator` and `Extend` for `Vec<{u8,u16}>` with reference-producing `Utf{8,16}Char` iterators too.
* Add `Utf8CharSplitter` and `Utf16CharSplitter`: `Utf{8,16}Char`-to-`u{8,16}` iterator adapters.
* Add `IterExt`, `iter_bytes()` and `iter_units()` to create the above splitting iterators.
* Add `Utf8Char::from_ascii()`, `Utf16Char::from_bmp()` with `_unchecked` versions of both.
* Add cross-type `PartialEq` and `PartialOrd` implementations.
* Change the `description()` for a few error types.
Version 0.3.2 (2018-08-08)
==========================
* Hide `AsciiExt` deprecation warning and add replacement methods.
* Correct documentation for `U8UtfExt::extra_utf8_bytes()`.
* Fix misspellings in some error descriptions.
* Avoid potentially bad transmutes.
Version 0.3.1 (2017-06-16)
==========================
* Implement `Display` for `Utf8Char` and `Utf16Char`.
Version 0.3.0 (2017-03-29)
==========================
* Replace the "no_std" feature with opt-out "std".
* Upgrade ascii to v0.8.
* Make tests compile on stable.
* Remove `CharExt::write_utf{8,16}()` because `encode_utf{8,16}()` has been stabilized.
* Return a proper error from `U16UtfExt::utf16_needs_extra_unit()` instead of `None`.
* Rename `U16UtfExt::utf_is_leading_surrogate()` to `is_utf16_leading_surrogate()`.
* Rename `Utf16Char::from_slice()` to `from_slice_start()` and `CharExt::from_utf{8,16}_slice()`
to `from_utf{8,16}_slice_start()` to be consistent with `Utf8Char`.
* Fix a bug where `CharExt::from_slice()` would accept some trailing surrogates
as standalone codepoints.
Version 0.2.0 (2016-07-24)
==========================
* Change `CharExt::write_utf{8,16}()` to panic instead of returning `None`
if the slice is too short.
* Fix bug where `CharExt::write_utf8()` and `Utf8Char::to_slice()` could change bytes it shouldn't.
* Rename lots of errors with search and replace:
* CodePoint -> Codepoint
* Several -> Multiple
* Update the ascii feature to use [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) v0.7.
* Support `#[no_std]`; see 70e090ee for differences.
* Ungate impls of `AsciiExt`. (doesn't require ascii or nightly)
* Make the tests compile (and pass) again.
(They still require nightly).
Version 0.1.* (2016-04-07)
==========================
First release.

282
vendor/encode_unicode/benches/length.rs vendored Normal file
View File

@@ -0,0 +1,282 @@
/* Copyright 2018-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
// Run with -- --nocapture to show error messages if setup fails.
// (or use ./do.sh)
#![cfg(feature="std")]
#![feature(test)]
extern crate test;
use test::{Bencher, black_box};
use std::fs;
use std::path::Path;
use std::io::ErrorKind;
use std::thread::sleep;
use std::time::Duration;
use std::collections::HashMap;
extern crate minreq;
#[macro_use] extern crate lazy_static;
extern crate encode_unicode;
use encode_unicode::{CharExt, Utf8Char, U8UtfExt, Utf16Char, U16UtfExt};
// Setup; need longish strings to make benchmarks representative and
// reduce overhead (might get cache misses now though)
// Therefore we download a few wikipedia articles in different languages.
// Downloading a fixed revision of the articles doesn't prevent the HTML from
// changing due to changes in templates or rendering.
/// Fetch a fixed revision of a Wikipedia article, caching it under benches/texts/.
///
/// Returns the article HTML. A cached copy is used if present;
/// caching failures are nonfatal (a warning is printed and the benchmark runs anyway).
/// Panics if the cached file exists but is unreadable or not UTF-8,
/// or if the download fails.
fn load_wikipedia(language: &str, article: &str, english: &str, revision: usize) -> String {
    let cache_path = Path::new("benches").join("texts");
    let cache_path = cache_path.to_str().unwrap();
    let name = format!("{}_{}.html", language, english);
    let path = Path::new(cache_path).join(&name);
    let path = path.to_str().unwrap();
    match fs::read_to_string(path) {
        Ok(content) => return content,
        Err(ref e) if e.kind() == ErrorKind::NotFound => {},//continue
        Err(ref e) if e.kind() == ErrorKind::InvalidData => {
            panic!("{} exists but is not UTF-8", &name);
        },
        Err(e) => panic!("{} exists but cannot be read ({})", path, e),
    }
    // Percent-encode the article title for the URL.
    // RFC 3986 requires percent-encoding the UTF-8 *bytes* of a character
    // (one "%XX" per byte), not its codepoint: the previous
    // `format!("%{:2X}", c as u32)` produced e.g. "%4E2D" for '中',
    // which is not a valid encoding of that character (and also lacked
    // zero-padding, as {:2X} pads with spaces).
    let mut article_ascii = String::new();
    for c in article.chars() {
        if c.is_ascii() {
            article_ascii.push(c);
        } else {
            let mut buf = [0u8; 4];
            for &b in c.encode_utf8(&mut buf).as_bytes() {
                article_ascii.push_str(&format!("%{:02X}", b));
            }
        }
    }
    let url = format!("https://{}.m.wikipedia.org/w/index.php?title={}&oldid={}",
        language, article_ascii, revision
    );
    println!("Downloading {} and saving to {}", &url, path);
    let response = minreq::get(&url).send().unwrap_or_else(|e| {
        panic!("Cannot get {}: {}", url, e);
    });
    if response.status_code != 200 {
        panic!("Bad URL {}: {} {}", url, response.status_code, response.reason_phrase);
    }
    let content = String::from_utf8(response.into_bytes()).unwrap_or_else(|_| {
        panic!("Response from {} is not UTF-8", url);
    });
    // Best-effort caching: the benchmark still works if the disk write fails.
    if let Err(e) = fs::create_dir_all(cache_path) {
        eprintln!("Warning: failed to create directory {}: {}", cache_path, e);
    } else if let Err(e) = fs::write(&path, &content) {
        eprintln!("Warning: failed to save {}: {}", path, e);
    }
    // Be polite to the server between consecutive downloads.
    sleep(Duration::from_secs(1));
    content
}
// (language code, article title, ASCII name used for the cache file, pinned revision)
const ARTICLES: &[(&str, &str, &str, usize)] = &[
    ("en", "United_Kingdom", "United_Kingdom", 855522252),// 99,7% ASCII
    ("es", "España", "Spain", 109861222),// 1,75% 2-byte characters
    ("ru", "Россия", "Russia", 94607243),// 36% 2-byte characters
    ("zh", "中國", "China", 50868604),// 30% 3-byte characters
];
lazy_static!{
    // The downloaded article texts keyed by language code, plus an "ascii"
    // entry distilled from whichever article has the most ASCII bytes.
    static ref STRINGS: HashMap<&'static str, String> = {
        let mut content = HashMap::new();
        for &(language, article, english, revision) in ARTICLES {
            content.insert(language, load_wikipedia(language, article, english, revision));
        }
        // make one string with only ASCII
        let only_ascii = content.values()
            .map(|v| (v, v.bytes().filter(|b| b.is_ascii() ).count()) )
            .max_by_key(|&(_,len)| len )
            .map(|(v,_)| v.bytes().filter(|b| b.is_ascii() ).map(|b| b as char ).collect() )
            .unwrap();
        content.insert("ascii", only_ascii);
        content
    };
    // Every text truncated to the codepoint count of the shortest text,
    // so per-character benchmarks do the same number of iterations.
    static ref EQUAL_CHARS: HashMap<&'static str, &'static str> = {
        let (least, chars) = STRINGS.iter()
            .map(|(l,s)| (l, s.chars().count()) )
            .min_by_key(|&(_,chars)| chars )
            .unwrap();
        println!("chars: {} (limited by {})", chars, least);
        STRINGS.iter().map(|(&language, string)| {
            // cut at the byte offset of codepoint number `chars`
            let cut = string.char_indices()
                .nth(chars)
                .map_or(string.len(), |(i,_)| i );
            let string = &string[..cut];
            assert_eq!(string.chars().count(), chars);
            (language, string)
        }).collect()
    };
    // Every text truncated to the UTF-8 byte length of the shortest text.
    static ref EQUAL_BYTES: HashMap<&'static str, String> = {
        let (least, bytes) = STRINGS.iter()
            .map(|(l,s)| (l, s.len()) )
            .min_by_key(|&(_,bytes)| bytes )
            .unwrap();
        println!("bytes: {} (limited by {})", bytes, least);
        STRINGS.iter().map(|(&language, string)| {
            let mut remaining = bytes;
            // take just so many characters that their length is exactly $bytes
            // (slicing won't work if !string.is_char_boundary(bytes))
            let string = string.chars().filter(|c| {
                match remaining.checked_sub(c.len_utf8()) {
                    Some(after) => {remaining = after; true},
                    None => false
                }
            }).collect::<String>();
            assert_eq!(string.len(), bytes);
            (language, string)
        }).collect()
    };
    // Every text truncated to the UTF-16 unit length of the shortest text.
    static ref EQUAL_UNITS: HashMap<&'static str, String> = {
        let (least, units) = STRINGS.iter()
            .map(|(l,s)| (l, s.chars().map(|c| c.len_utf16() ).sum::<usize>()) )
            .min_by_key(|&(_,units)| units )
            .unwrap();
        println!("units: {} (limited by {})", units, least);
        STRINGS.iter().map(|(&language, string)| {
            let mut remaining = units;
            // same approach as EQUAL_BYTES, but measured in UTF-16 units
            let string = string.chars().filter(|c| {
                match remaining.checked_sub(c.len_utf16()) {
                    Some(after) => {remaining = after; true},
                    None => false
                }
            }).collect::<String>();
            assert_eq!(string.chars().map(|c| c.len_utf16() ).sum::<usize>(), units);
            (language, string)
        }).collect()
    };
}
///////////////////////////
// benchmarks begin here //
///////////////////////////
/// Benchmark summing `Utf8Char::len()` over texts of equal UTF-8 byte length.
///
/// The total must equal the byte length of the source text, which doubles
/// as a sanity check that `len()` is computed correctly.
fn utf8char_len(language: &str, b: &mut Bencher) {
    let text = &EQUAL_BYTES[language];
    let encoded: Vec<Utf8Char> = text.chars().map(|c| c.to_utf8() ).collect();
    let expected_bytes = text.len();
    b.iter(|| {
        let mut total = 0usize;
        for u8c in black_box(&encoded).iter() {
            total += u8c.len();
        }
        assert_eq!(total, expected_bytes);
    });
}
#[bench] fn utf8char_len_ascii(b: &mut Bencher) {utf8char_len("ascii", b)}
#[bench] fn utf8char_len_en(b: &mut Bencher) {utf8char_len("en", b)}
#[bench] fn utf8char_len_es(b: &mut Bencher) {utf8char_len("es", b)}
#[bench] fn utf8char_len_ru(b: &mut Bencher) {utf8char_len("ru", b)}
#[bench] fn utf8char_len_zh(b: &mut Bencher) {utf8char_len("zh", b)}
/// Benchmark stepping through a byte slice via
/// `U8UtfExt::extra_utf8_bytes_unchecked()` on each leading byte.
fn utf8_extra_bytes_unchecked(language: &str, b: &mut Bencher) {
    let string = &EQUAL_CHARS[language];
    let chars = string.chars().count();
    let string = string.as_bytes();
    b.iter(|| {
        let mut i = 0;
        let mut loops = 0;
        while i < string.len() {
            // jump over the continuation bytes without inspecting them
            i += string[i].extra_utf8_bytes_unchecked();
            i += 1;
            loops += 1;
        }
        // one loop iteration per codepoint proves every step landed on a boundary
        assert_eq!(loops, chars);
    });
}
#[bench] fn utf8_extra_bytes_unchecked_ascii(b: &mut Bencher) {utf8_extra_bytes_unchecked("ascii", b)}
#[bench] fn utf8_extra_bytes_unchecked_en(b: &mut Bencher) {utf8_extra_bytes_unchecked("en", b)}
#[bench] fn utf8_extra_bytes_unchecked_es(b: &mut Bencher) {utf8_extra_bytes_unchecked("es", b)}
#[bench] fn utf8_extra_bytes_unchecked_ru(b: &mut Bencher) {utf8_extra_bytes_unchecked("ru", b)}
#[bench] fn utf8_extra_bytes_unchecked_zh(b: &mut Bencher) {utf8_extra_bytes_unchecked("zh", b)}
/// Benchmark the checked variant, `U8UtfExt::extra_utf8_bytes()`,
/// on the same texts as the unchecked benchmark above.
fn utf8_extra_bytes(language: &str, b: &mut Bencher) {
    let string = &EQUAL_CHARS[language];
    let chars = string.chars().count();
    let string = string.as_bytes();
    b.iter(|| {
        let mut i = 0;
        let mut loops = 0;
        let mut errors = 0;
        while i < string.len() {
            match string[i].extra_utf8_bytes() {
                Ok(n) => i += n,
                // counted instead of panicking so the hot loop stays branch-simple
                Err(_) => errors += 1,
            }
            i += 1;
            loops += 1;
        }
        assert_eq!(loops, chars);
        // the input is valid UTF-8, so no leading byte should be rejected
        assert_eq!(errors, 0);
    });
}
#[bench] fn utf8_extra_bytes_ascii(b: &mut Bencher) {utf8_extra_bytes("ascii", b)}
#[bench] fn utf8_extra_bytes_en(b: &mut Bencher) {utf8_extra_bytes("en", b)}
#[bench] fn utf8_extra_bytes_es(b: &mut Bencher) {utf8_extra_bytes("es", b)}
#[bench] fn utf8_extra_bytes_ru(b: &mut Bencher) {utf8_extra_bytes("ru", b)}
#[bench] fn utf8_extra_bytes_zh(b: &mut Bencher) {utf8_extra_bytes("zh", b)}
/// Benchmark summing `Utf16Char::len()` over texts of equal UTF-16 unit count.
fn utf16char_len(language: &str, b: &mut Bencher) {
    let string = &EQUAL_UNITS[language];
    let chars: Vec<Utf16Char> = string.chars().map(|c| c.to_utf16() ).collect();
    let units = string.chars().map(|c| c.len_utf16() ).sum::<usize>();
    b.iter(|| {
        // (renamed from the copy-pasted `u8c`: these are Utf16Chars)
        let sum: usize = black_box(&chars).iter().map(|u16c| u16c.len() ).sum();
        assert_eq!(sum, units);
    });
}
#[bench] fn utf16char_len_ascii(b: &mut Bencher) {utf16char_len("ascii", b)}
#[bench] fn utf16char_len_en(b: &mut Bencher) {utf16char_len("en", b)}
// fixed copy-paste bug: this benchmark previously measured "en" instead of "es"
#[bench] fn utf16char_len_es(b: &mut Bencher) {utf16char_len("es", b)}
#[bench] fn utf16char_len_ru(b: &mut Bencher) {utf16char_len("ru", b)}
#[bench] fn utf16char_len_zh(b: &mut Bencher) {utf16char_len("zh", b)}
/// Benchmark stepping through UTF-16 units via
/// `U16UtfExt::is_utf16_leading_surrogate()`.
fn utf16_is_leading_surrogate(language: &str, b: &mut Bencher) {
    let string = &EQUAL_UNITS[language];
    let chars = string.chars().count();
    // collect() works here because Utf16Char-producing iterators can extend Vec<u16>
    let string: Vec<u16> = string.chars().map(|c| c.to_utf16() ).collect();
    b.iter(|| {
        let mut i = 0;
        let mut loops = 0;
        while i < string.len() {
            // a leading surrogate means the codepoint occupies two units
            i += if string[i].is_utf16_leading_surrogate() {2} else {1};
            loops += 1;
        }
        // one iteration per codepoint proves each step landed on a boundary
        assert_eq!(loops, chars);
    });
}
#[bench] fn utf16_is_leading_surrogate_ascii(b: &mut Bencher) {utf16_is_leading_surrogate("ascii", b)}
#[bench] fn utf16_is_leading_surrogate_en(b: &mut Bencher) {utf16_is_leading_surrogate("en", b)}
#[bench] fn utf16_is_leading_surrogate_es(b: &mut Bencher) {utf16_is_leading_surrogate("es", b)}
#[bench] fn utf16_is_leading_surrogate_ru(b: &mut Bencher) {utf16_is_leading_surrogate("ru", b)}
#[bench] fn utf16_is_leading_surrogate_zh(b: &mut Bencher) {utf16_is_leading_surrogate("zh", b)}
/// Benchmark the checked variant, `U16UtfExt::utf16_needs_extra_unit()`,
/// on the same texts as the surrogate-test benchmark above.
fn utf16_needs_extra_unit(language: &str, b: &mut Bencher) {
    let string = &EQUAL_UNITS[language];
    let chars = string.chars().count();
    let string: Vec<u16> = string.chars().map(|c| c.to_utf16() ).collect();
    b.iter(|| {
        let mut i = 0;
        let mut loops = 0;
        let mut errors = 0;
        while i < string.len() {
            i += match string[i].utf16_needs_extra_unit() {
                Ok(true) => 2,
                Ok(false) => 1,
                // count errors instead of panicking inside the timed loop
                Err(_) => {errors+=1; 1}
            };
            loops += 1;
        }
        assert_eq!(loops, chars);
        // valid input: no unit should be a trailing surrogate at a boundary
        assert_eq!(errors, 0);
    });
}
#[bench] fn utf16_needs_extra_unit_ascii(b: &mut Bencher) {utf16_needs_extra_unit("ascii", b)}
#[bench] fn utf16_needs_extra_unit_en(b: &mut Bencher) {utf16_needs_extra_unit("en", b)}
#[bench] fn utf16_needs_extra_unit_es(b: &mut Bencher) {utf16_needs_extra_unit("es", b)}
#[bench] fn utf16_needs_extra_unit_ru(b: &mut Bencher) {utf16_needs_extra_unit("ru", b)}
#[bench] fn utf16_needs_extra_unit_zh(b: &mut Bencher) {utf16_needs_extra_unit("zh", b)}

View File

@@ -0,0 +1,122 @@
/* Copyright 2018 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
// Run with -- --nocapture to show error messages if setup fails.
// (or use ./do.sh)
// uses /usr/share/dict/ for text to convert to Vec<Utf*Char> and iterate over
#![cfg(all(unix, feature="std"))]
#![feature(test)]
extern crate test;
use test::{Bencher, black_box};
#[macro_use] extern crate lazy_static;
extern crate encode_unicode;
use encode_unicode::{CharExt, Utf8Char, Utf16Char, IterExt};
/// Read a whole file into a `String`, exiting the process on failure.
///
/// A missing file exits with status 0 so the benchmarks are skipped
/// rather than reported as failed; any other error exits with status 1.
fn read_or_exit(file: &str) -> String {
    let mut fd = match std::fs::File::open(file) {
        Ok(fd) => fd,
        Err(ref err) if err.kind() == std::io::ErrorKind::NotFound => {
            eprintln!("{} not found, skipping benchmarks.", file);
            std::process::exit(0);
        },
        Err(err) => {
            eprintln!("Failed to open {}: {}.", file, err);
            std::process::exit(1);
        },
    };
    let mut content = String::new();
    if let Err(err) = std::io::Read::read_to_string(&mut fd, &mut content) {
        eprintln!("Failed to read {}: {}.", file, err);
        std::process::exit(1);
    }
    content
}
lazy_static!{
    // TODO find a big chinese file; `aptitude search '?provides(wordlist)'` didn't have one
    // Source text: the system wordlist (mostly ASCII with a few accented words).
    static ref ENGLISH: String = read_or_exit("/usr/share/dict/american-english");
    // The same text pre-converted once, so the benchmarks only time iteration.
    static ref UTF8CHARS: Vec<Utf8Char> = ENGLISH.chars().map(|c| c.to_utf8() ).collect();
    static ref UTF16CHARS: Vec<Utf16Char> = ENGLISH.chars().map(|c| c.to_utf16() ).collect();
}
// NOTE(review): "mulititerator" is a typo for "multiiterator", but renaming
// would change the benchmark name in recorded results, so it is kept.
#[bench]
fn utf16_split_all_single_mulititerator(b: &mut Bencher) {
    b.iter(|| {
        // split Utf16Chars into u16 units via this crate's IterExt::to_units()
        black_box(&*UTF16CHARS).iter().to_units().for_each(|u| assert!(u != 0) );
    });
}
/// Measure splitting into units via `Iterator::flat_map()`.
///
/// Bug fix: this benchmark used `.cloned().flatten()`, making it byte-identical
/// to `utf16_split_all_single_cloned_flatten` below; it now actually uses
/// `flat_map()` as its name claims (same items: Utf16Char is IntoIterator).
#[bench]
fn utf16_split_all_single_flatmap(b: &mut Bencher) {
    b.iter(|| {
        black_box(&*UTF16CHARS).iter().flat_map(|u16c| *u16c ).for_each(|u| assert!(u != 0) );
    });
}
// Baseline: splitting via the standard .cloned().flatten() adaptor chain.
#[bench]
fn utf16_split_all_single_cloned_flatten(b: &mut Bencher) {
    b.iter(|| {
        black_box(&*UTF16CHARS).iter().cloned().flatten().for_each(|u| assert!(u != 0) );
    });
}
// Split Utf8Chars into bytes via this crate's IterExt::to_bytes().
#[bench]
fn utf8_split_mostly_ascii_multiiterator(b: &mut Bencher) {
    b.iter(|| {
        black_box(&*UTF8CHARS).iter().to_bytes().for_each(|b| assert!(b != 0) );
    });
}
/// Measure splitting into bytes via `Iterator::flat_map()`.
///
/// Bug fix: this benchmark used `.cloned().flatten()`, making it byte-identical
/// to `utf8_split_mostly_ascii_cloned_flatten` below; it now actually uses
/// `flat_map()` as its name claims (same items: Utf8Char is IntoIterator).
#[bench]
fn utf8_split_mostly_ascii_flatmap(b: &mut Bencher) {
    b.iter(|| {
        black_box(&*UTF8CHARS).iter().flat_map(|u8c| *u8c ).for_each(|b| assert!(b != 0) );
    });
}
// Baseline: splitting via the standard .cloned().flatten() adaptor chain.
#[bench]
fn utf8_split_mostly_ascii_cloned_flatten(b: &mut Bencher) {
    b.iter(|| {
        black_box(&*UTF8CHARS).iter().cloned().flatten().for_each(|b| assert!(b != 0) );
    });
}
// The *_extend_* benchmarks compare collecting the split-up units into a Vec:
// via the crate's splitting iterators vs via its custom FromIterator impls.
#[bench]
fn utf8_extend_mostly_ascii_multiiterator(b: &mut Bencher) {
    b.iter(|| {
        let vec: Vec<u8> = black_box(&*UTF8CHARS).iter().to_bytes().collect();
        // mostly-ASCII text: total bytes equal the source string's length
        assert_eq!(black_box(vec).len(), ENGLISH.len());
    });
}
// Uses the FromIterator<&Utf8Char> impl for Vec<u8> provided by this crate.
#[bench]
fn utf8_extend_mostly_ascii_custom(b: &mut Bencher) {
    b.iter(|| {
        let vec: Vec<u8> = black_box(&*UTF8CHARS).iter().collect();
        assert_eq!(black_box(vec).len(), ENGLISH.len());
    });
}
// Same, but collecting into a String instead of Vec<u8>.
#[bench]
fn utf8_extend_mostly_ascii_custom_str(b: &mut Bencher) {
    b.iter(|| {
        let vec: String = black_box(&*UTF8CHARS).iter().cloned().collect();
        assert_eq!(black_box(vec).len(), ENGLISH.len());
    });
}
#[bench]
fn utf16_extend_all_single_multiiterator(b: &mut Bencher) {
    b.iter(|| {
        let vec: Vec<u16> = black_box(&*UTF16CHARS).iter().to_units().collect();
        // fewer u16 units than u8 bytes for any non-empty mostly-ASCII text
        assert!(black_box(vec).len() < ENGLISH.len());
    });
}
// Uses the FromIterator<&Utf16Char> impl for Vec<u16> provided by this crate.
#[bench]
fn utf16_extend_all_single_custom(b: &mut Bencher) {
    b.iter(|| {
        let vec: Vec<u16> = black_box(&*UTF16CHARS).iter().collect();
        assert!(black_box(vec).len() < ENGLISH.len());
    });
}

96
vendor/encode_unicode/do.sh vendored Executable file
View File

@@ -0,0 +1,96 @@
#!/usr/bin/env bash
# Check, lint, test, fuzz and benchmark the crate across toolchains.
# With no argument everything except `ignored` and `help` runs;
# with one argument only that step (and any toolchain setup it needs) runs.
set -e -o pipefail
MSRV=1.56.1
FUZZ_DURATION=60
FUZZ_PAUSE=2
# Any flag-like argument, `help`, or more than one argument prints usage.
if [[ ${1:0:1} == - || $1 == help ]] || (( $# > 1 )); then
    echo "A script to make it easy to check & lint & test everything." >&2
    echo "It assumes rustup is installed and that cargo +release works." >&2
    echo >&2
    echo "Usage: $0 ([setup|MSRV|check|test|ignored|clippy|miri|fuzz|bench|shellcheck|help])" >&2
    echo "If no argument is provided, all parts except ignored and help are run," >&2
    echo "but setup is only done if auto-detection fails." >&2
    exit 1
fi
# should have been a Makefile
# core check, Minimum supported Rust version
# Toolchains are installed on `setup`, or lazily when `rustup show` lacks them.
if [[ $1 == setup ]] || ! rustup show | grep --silent "$MSRV"; then
    rustup install "$MSRV" --no-self-update
fi
# NOTE(review): usage text spells this step "MSRV" but only lowercase matches here.
if [[ -z $1 || $1 == msrv ]]; then
    # FIXME modify Cargo.toml like on CI, and then restore it and Cargo.lock afterwards
    cargo "+$MSRV" build --all-features
fi
# check all feature combinations, stable
if [[ $1 == setup ]] || ! rustup show | grep --silent stable; then
    rustup install stable --no-self-update
fi
if [[ -z $1 || $1 == check ]]; then
    cargo +stable check --examples --tests --no-default-features
    cargo +stable check --examples --tests --no-default-features --features std
    cargo +stable check --examples --tests --no-default-features --features ascii
    cargo +stable check --examples --tests --all-features
fi
# tests, stable
if [[ -z $1 || $1 == test ]]; then
    cargo +stable test --all-features -- --quiet
elif [[ $1 == ignored ]]; then
    cargo +stable test --all-features -- --quiet --ignored
fi
# clippy, nightly
if [[ $1 == setup ]] || ! rustup show | grep --silent nightly; then
    rustup install nightly --no-self-update
fi
if [[ $1 == setup ]] || ! cargo +nightly help clippy >/dev/null 2>/dev/null; then
    rustup component add clippy --toolchain nightly
fi
if [[ -z $1 || $1 == clippy ]]; then
    cargo +nightly clippy --all-features --tests --benches --examples
fi
# miri, nightly
if [[ $1 == setup ]] || ! cargo +nightly help miri >/dev/null 2>/dev/null; then
    rustup component add miri --toolchain nightly
    cargo +nightly miri setup
fi
if [[ -z $1 || $1 == miri ]]; then
    cargo +nightly miri test --all-features -- --quiet
fi
# fuzzing tests, nightly
if [[ $1 == setup ]] || ! command -V cargo-fuzz >/dev/null 2>/dev/null; then
    cargo +nightly install cargo-fuzz
fi
if [[ -z $1 || $1 == fuzz ]]; then
    cargo +nightly fuzz build
    # Run each fuzz target for a bounded time; timeout's nonzero exit is expected.
    for fuzztest in $(cargo +nightly fuzz list); do
        sleep "$FUZZ_PAUSE"
        echo "Fuzzing $fuzztest"
        timeout "$FUZZ_DURATION" \
            cargo +nightly fuzz run "$fuzztest" \
            || true
        echo
    done
fi
# benchmarks, nightly
if [[ -z $1 || $1 == bench ]]; then
    cargo +nightly check --benches --no-default-features
    cargo +nightly check --benches --no-default-features --features std
    cargo +nightly check --benches --no-default-features --features ascii
    cargo +nightly check --benches --all-features
    # need nocapture to not hide error if setup fails
    cargo +nightly bench --all-features -- --nocapture
fi
# Lint this script itself, but only when shellcheck is available.
if [[ $1 == shellcheck || $1 == selfcheck ]] \
|| ([[ -z $1 ]] && command -V shellcheck >/dev/null 2>/dev/null); then
    shellcheck "$0"
fi

View File

@@ -0,0 +1,89 @@
/* Copyright 2018 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Counts the number of codepoints of each UTF-8 length in files
use std::env::args_os;
use std::fs::File;
use std::io::{self, Read, stdin};
use std::borrow::Cow;
extern crate encode_unicode;
use encode_unicode::U8UtfExt;
/// Tally of one input: total bytes read, and how many UTF-8 sequences of
/// each encoded length were started.
#[derive(Default)]
struct Distribution {
    // total number of bytes read from the input
    bytes: usize,
    // count of sequence-starting bytes by encoded length - 1
    // (index 0 = 1-byte/ASCII, ..., index 3 = 4-byte)
    utf8: [usize; 4],
}
/// Tally the UTF-8 length distribution of everything readable from `file`.
///
/// Returns the distribution gathered so far plus the error that stopped
/// reading, if any — a read error does not discard partial counts.
fn read(file: &mut dyn Read) -> (Distribution, Option<io::Error>) {
    let mut r = Distribution::default();
    let mut buf = [0u8; 4096];
    loop {
        let read = match file.read(&mut buf) {
            Ok(0) => return (r, None),// EOF
            Ok(n) => n,
            Err(e) => return (r, Some(e)),
        };
        r.bytes += read;
        for (o, &b) in buf[..read].iter().enumerate() {
            // Continuation bytes and invalid bytes make extra_utf8_bytes()
            // return Err and are simply not counted.
            if let Ok(i) = b.extra_utf8_bytes() {
                r.utf8[i] += 1;
                if i == 3 {
                    // 4-byte sequences are rare: print some surrounding context
                    // (up to 20 bytes before and 22 after, clamped to this buffer)
                    let min = o.saturating_sub(20);
                    let max = if o+23 <= read {o+23} else {read};
                    println!("{}", String::from_utf8_lossy(&buf[min..max]));
                }
            }
        }
    }
}
/// Print a two-line report for one input: byte/char totals, the number of
/// bytes used by sequences of each length, and the same as percentages.
///
/// `name_pad` right-aligns the name column across multiple files;
/// the second line and any error line are indented to match via `{:pad$}`
/// padding of an empty string.
fn display(name_pad: usize, name: Cow<str>,
           r: Distribution, err: Option<io::Error>) {
    let c = r.utf8;
    let characters = c[0]+c[1]+c[2]+c[3];
    // bytes consumed by sequences of each length (count * length)
    let s = [c[0], c[1]*2, c[2]*3, c[3]*4];
    let p = [
        (s[0]*100) as f32 / r.bytes as f32,
        (s[1]*100) as f32 / r.bytes as f32,
        (s[2]*100) as f32 / r.bytes as f32,
        (s[3]*100) as f32 / r.bytes as f32,
    ];
    println!("{:>6$}: bytes: {:7}, UTF-8 distribution: [{:7}, {:6}, {:6}, {:6}]",
        name, r.bytes, s[0], s[1], s[2], s[3], name_pad
    );
    println!("{5:6$} chars: {:7}, UTF-8 percentages: [{:>6.2}%, {:>5.2}%, {:>5.2}%, {:>5.2}%]",
        characters, p[0], p[1], p[2], p[3], "", name_pad
    );
    if let Some(err) = err {
        println!("{1:2$} {}", err, "", name_pad);
    }
}
/// Tally each file given as an argument, or stdin when no arguments are given.
fn main() {
    // Widest file-name length, used to align the report columns.
    // None when there are no arguments.
    let name_length = args_os().skip(1)
        .map(|path| path.to_string_lossy().chars().count() )
        .max();
    for path in args_os().skip(1) {
        let name = path.to_string_lossy();
        let (r,err) = match File::open(&path) {
            Ok(mut file) => read(&mut file),
            Err(err) => {
                eprintln!("{}:\t{}", name, err);
                continue;
            }
        };
        // unwrap() is safe: the loop only runs when there was at least one argument
        display(name_length.unwrap(), name, r, err);
    }
    if name_length.is_none() {
        let stdin = stdin();
        let (r,err) = read(&mut stdin.lock());
        display(0, Cow::Borrowed("stdin"), r, err);
    }
}

View File

@@ -0,0 +1,490 @@
/* Copyright 2018-2020 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail.
//!
//! To be predictable, all errors consume one element each.
//!
//! The iterator adaptors produce neither offset nor element length to work
//! well with other adaptors,
//! while the slice iterators yield both to make more advanced use cases easy.
use crate::errors::{Utf16FirstUnitError, Utf16PairError, Utf8Error};
use crate::errors::Utf16SliceError::*;
use crate::errors::Utf16PairError::*;
use crate::errors::Utf8ErrorKind::*;
use crate::utf8_char::Utf8Char;
use crate::utf16_char::Utf16Char;
use crate::traits::U16UtfExt;
extern crate core;
use core::borrow::Borrow;
use core::fmt::{self, Debug};
use core::iter::Chain;
use core::option;
/// Decodes UTF-8 characters from a byte iterator into `Utf8Char`s.
///
/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharMerger<B:Borrow<u8>, I:Iterator<Item=B>> {
    iter: I,
    /// number of bytes that were read before an error was detected
    after_err_leftover: u8,
    /// stack because it simplifies popping.
    after_err_stack: [u8; 3],
}
// Accept anything iterable over borrowable bytes (slices, arrays, iterators, ...).
impl<B:Borrow<u8>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
From<T> for Utf8CharMerger<B, I> {
    fn from(t: T) -> Self {
        Utf8CharMerger {
            iter: t.into_iter(),
            // start with an empty buffer of already-read bytes
            after_err_leftover: 0,
            after_err_stack: [0; 3],
        }
    }
}
impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> {
    /// Extract the inner iterator.
    ///
    /// If the last item produced by `.next()` was an `Err`,
    /// up to three following bytes might be missing.
    /// The exact number of missing bytes for each error type should not be relied on.
    ///
    /// # Examples
    ///
    /// Three bytes swallowed:
    /// ```
    /// # use encode_unicode::IterExt;
    /// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars();
    /// assert!(merger.next().unwrap().is_err());
    /// let mut inner: std::slice::Iter<u8> = merger.into_inner();
    /// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared
    /// ```
    ///
    /// All bytes present:
    /// ```
    /// # use encode_unicode::IterExt;
    /// let mut merger = b"\xb0FS".iter().to_utf8chars();
    /// assert!(merger.next().unwrap().is_err());
    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
    /// ```
    ///
    /// Two bytes missing:
    /// ```
    /// # use encode_unicode::IterExt;
    /// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars();
    /// assert!(merger.next().unwrap().is_err());
    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
    /// ```
    pub fn into_inner(self) -> I {
        self.iter
    }
    /// Buffer bytes[1..len] so they are re-examined by later next() calls.
    fn save(&mut self, bytes: &[u8;4], len: usize) {
        // forget bytes[0] and push the others onto self.after_err_stack (in reverse).
        for &after_err in bytes[1..len].iter().rev() {
            self.after_err_stack[self.after_err_leftover as usize] = after_err;
            self.after_err_leftover += 1;
        }
    }
    /// Reads len-1 bytes into bytes[1..]
    ///
    /// Returns Err (buffering the bytes read so far) if the source runs dry
    /// or a fetched byte is not a continuation byte.
    fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),Utf8Error> {
        // This is the only function that pushes onto after_err_stack,
        // and it checks that all bytes are continuation bytes before fetching the next one.
        // Therefore only the last byte retrieved can be a non-continuation byte.
        // That last byte is also the last to be retrieved from after_err.
        //
        // Before this function is called, there has been retrieved at least one byte.
        // If that byte was a continuation byte, next() produces an error
        // and won't call this function.
        // Therefore, we know that after_err is empty at this point.
        // This means that we can use self.iter directly, and knows where to start pushing
        debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack);
        for i in 1..len {
            if let Some(extra) = self.iter.next() {
                let extra = *extra.borrow();
                bytes[i] = extra;
                if extra & 0b1100_0000 != 0b1000_0000 {
                    // not a continuation byte
                    self.save(bytes, i+1);
                    return Err(Utf8Error{ kind: InterruptedSequence })
                }
            } else {
                self.save(bytes, i);
                return Err(Utf8Error{ kind: TooFewBytes });
            }
        }
        Ok(())
    }
}
impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> {
    type Item = Result<Utf8Char,Utf8Error>;
    fn next(&mut self) -> Option<Self::Item> {
        // Bytes buffered after a previous error take priority over the source.
        let first: u8;
        if self.after_err_leftover != 0 {
            self.after_err_leftover -= 1;
            first = self.after_err_stack[self.after_err_leftover as usize];
        } else if let Some(next) = self.iter.next() {
            first = *next.borrow();
        } else {
            return None;
        }
        // SAFETY: every Ok(()) arm below has either validated the sequence
        // (continuation bytes checked by extra(), overlong / too-high /
        // surrogate ranges rejected here) or is plain ASCII, so
        // from_array_unchecked only ever sees valid UTF-8.
        unsafe {
            let mut bytes = [first, 0, 0, 0];
            // Dispatch on the leading byte; the ranges cover all u8 values.
            let ok = match first {
                0b0000_0000..=0b0111_1111 => {/*1 and */Ok(())},
                0b1100_0010..=0b1101_1111 => {//2 and not overlong
                    self.extra(&mut bytes, 2) // no extra validation required
                },
                0b1110_0000..=0b1110_1111 => {//3
                    if let Err(e) = self.extra(&mut bytes, 3) {
                        Err(e)
                    } else if bytes[0] == 0b1110_0000 && bytes[1] <= 0b10_011111 {
                        self.save(&bytes, 3);
                        Err(Utf8Error{ kind: OverlongEncoding })
                    } else if bytes[0] == 0b1110_1101 && bytes[1] & 0b11_100000 == 0b10_100000 {
                        // U+D800..U+DFFF: surrogates are not valid in UTF-8
                        self.save(&bytes, 3);
                        Err(Utf8Error{ kind: Utf16ReservedCodepoint })
                    } else {
                        Ok(())
                    }
                },
                0b1111_0000..=0b1111_0100 => {//4
                    if let Err(e) = self.extra(&mut bytes, 4) {
                        Err(e)
                    } else if bytes[0] == 0b11110_000 && bytes[1] <= 0b10_001111 {
                        self.save(&bytes, 4);
                        Err(Utf8Error{ kind: OverlongEncoding })
                    } else if bytes[0] == 0b11110_100 && bytes[1] > 0b10_001111 {
                        // would decode to above U+10FFFF
                        self.save(&bytes, 4);
                        Err(Utf8Error{ kind: TooHighCodepoint })
                    } else {
                        Ok(())
                    }
                },
                0b1000_0000..=0b1011_1111 => {// continuation byte
                    Err(Utf8Error{ kind: UnexpectedContinuationByte })
                },
                0b1100_0000..=0b1100_0001 => {// 2 and overlong
                    Err(Utf8Error{ kind: NonUtf8Byte })
                },
                0b1111_0101..=0b1111_0111 => {// 4 and too high codepoint
                    Err(Utf8Error{ kind: NonUtf8Byte })
                },
                0b1111_1000..=0b1111_1111 => {
                    Err(Utf8Error{ kind: NonUtf8Byte })
                },
            };
            Some(ok.map(|()| Utf8Char::from_array_unchecked(bytes) ))
        }
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        let (iter_min, iter_max) = self.iter.size_hint();
        // cannot be exact, so KISS
        let min = iter_min / 4; // don't bother rounding up or accounting for after_err
        // handle edge case of max > usize::MAX-3 just in case.
        // Using wrapping_add() wouldn't violate any API contract as the trait isn't unsafe.
        let max = iter_max.and_then(|max| {
            max.checked_add(self.after_err_leftover as usize)
        });
        (min, max)
    }
}
impl<B:Borrow<u8>, I:Iterator<Item=B>+Debug> Debug for Utf8CharMerger<B,I> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // after_err_stack is popped from the top, so reverse its live part
        // to show the buffered bytes in the order next() will yield them.
        let buffered = self.after_err_leftover as usize;
        let mut in_order = [0u8; 3];
        for (dst, &src) in in_order[..buffered].iter_mut()
                .zip(self.after_err_stack[..buffered].iter().rev()) {
            *dst = src;
        }
        fmtr.debug_struct("Utf8CharMerger")
            .field("buffered", &&in_order[..buffered])
            .field("inner", &self.iter)
            .finish()
    }
}
/// An [`Utf8CharMerger`](struct.Utf8CharMerger.html) that also produces
/// offsets and lengths, but can only iterate over slices.
///
/// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharDecoder<'a> {
    /// The source bytes; `next_back()` shrinks this from the end.
    slice: &'a[u8],
    /// Offset of the next byte `next()` will look at.
    index: usize,
}
impl<'a> From<&'a[u8]> for Utf8CharDecoder<'a> {
    /// Create a decoder that starts at the beginning of the slice.
    fn from(s: &[u8]) -> Utf8CharDecoder {
        Utf8CharDecoder { index: 0, slice: s }
    }
}
impl<'a> Utf8CharDecoder<'a> {
    /// Extract the remainder of the source slice.
    ///
    /// # Examples
    ///
    /// Unlike `Utf8CharMerger::into_inner()`, bytes directly after an error
    /// are never swallowed:
    /// ```
    /// # use encode_unicode::SliceExt;
    /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices();
    /// assert!(iter.next().unwrap().1.is_err());
    /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS");
    /// ```
    pub fn as_slice(&self) -> &'a[u8] {
        // index never exceeds slice.len(), so this cannot panic
        self.slice.split_at(self.index).1
    }
}
impl<'a> Iterator for Utf8CharDecoder<'a> {
    type Item = (usize, Result<Utf8Char,Utf8Error>, usize);
    /// Decode the next codepoint, yielding `(offset, result, bytes consumed)`.
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.index;
        match Utf8Char::from_slice_start(&self.slice[start..]) {
            Ok((u8c, len)) => {
                self.index = start + len;
                Some((start, Ok(u8c), len))
            },
            // An error on an empty remainder means the end was reached.
            Err(_) if start >= self.slice.len() => None,
            Err(e) => {
                // Skip a single byte so that decoding can resynchronize.
                self.index = start + 1;
                Some((start, Err(e), 1))
            },
        }
    }
    #[inline]
    fn size_hint(&self) -> (usize,Option<usize>) {
        // Cannot be exact, so KISS and don't bother rounding up.
        // The slice is unlikely be full of 4-byte codepoints, so buffers
        // allocated with the lower bound will have to be grown anyway.
        let remaining = self.slice.len() - self.index;
        (remaining/4, Some(remaining))
    }
}
impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> {
    /// Decode the last codepoint of the not-yet-consumed part of the slice.
    ///
    /// Consuming from the back shrinks `self.slice` instead of moving
    /// `self.index`, so reported offsets stay relative to the original slice.
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.index < self.slice.len() {
            // Count the continuation bytes (0b10xx_xxxx) at the end of the
            // slice; the byte before them should start the last codepoint.
            let extras = self.slice.iter()
                .rev()
                .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
                .count();
            // Cap the count so `starts` cannot underflow (and panic) when the
            // entire remaining slice consists of continuation bytes.
            let extras = if extras < self.slice.len() {extras} else {self.slice.len()-1};
            let starts = self.slice.len() - (extras+1);
            match Utf8Char::from_slice_start(&self.slice[starts..]) {
                Ok((u8c,len)) if len == 1+extras => {
                    self.slice = &self.slice[..starts];
                    Some((starts, Ok(u8c), len))
                },
                // This ensures errors for every byte in both directions,
                // but means overlong and codepoint errors will be turned into
                // tooshort errors.
                Err(e) if extras == 0 => {
                    // Only the last byte is consumed, so report its offset:
                    // the length of the slice *after* truncation, matching how
                    // the Ok arm reports the offset of the first consumed byte.
                    // (Reporting `len()-1` after truncation would be off by one.)
                    self.slice = &self.slice[..self.slice.len()-1];
                    Some((self.slice.len(), Err(e), 1))
                },
                _ => {
                    self.slice = &self.slice[..self.slice.len()-1];
                    Some((self.slice.len(), Err(Utf8Error{ kind: UnexpectedContinuationByte }), 1))
                },
            }
        } else {
            None
        }
    }
}
impl<'a> Debug for Utf8CharDecoder<'a> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Show how far the iterator has advanced and what remains.
        write!(fmtr, "Utf8CharDecoder {{ bytes[{}..]: {:?} }}", self.index, self.as_slice())
    }
}
/// Decodes UTF-16 characters from a `u16` iterator into `Utf16Char`s.
///
/// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharMerger<B:Borrow<u16>, I:Iterator<Item=B>> {
    /// The source of UTF-16 units.
    iter: I,
    /// Used when a trailing surrogate was expected, the u16 can be any value.
    prev: Option<B>,
}
impl<B:Borrow<u16>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
From<T> for Utf16CharMerger<B,I> {
fn from(t: T) -> Self {
Utf16CharMerger { iter: t.into_iter(), prev: None }
}
}
impl<B:Borrow<u16>, I:Iterator<Item=B>> Utf16CharMerger<B,I> {
    /// Extract the inner iterator.
    ///
    /// If the last item produced was an `Err`, the first unit might be missing.
    ///
    /// # Examples
    ///
    /// Unit right after an error missing
    /// ```
    /// # use encode_unicode::IterExt;
    /// # use encode_unicode::error::Utf16PairError;
    /// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars();
    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
    /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
    /// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger
    /// ```
    ///
    /// Error that doesn't swallow any units
    /// ```
    /// # use encode_unicode::IterExt;
    /// # use encode_unicode::error::Utf16PairError;
    /// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars();
    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate)));
    /// let mut inner: std::slice::Iter<u16> = merger.into_inner();
    /// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed
    /// ```
    pub fn into_inner(self) -> I {
        // Drops self.prev, which is how a buffered unit can get lost.
        self.iter
    }
    /// Returns an iterator over the remaining units.
    /// Unlike `into_inner()` this will never drop any units.
    ///
    /// The exact type of the returned iterator should not be depended on.
    ///
    /// # Examples
    ///
    /// ```
    /// # use encode_unicode::IterExt;
    /// # use encode_unicode::error::Utf16PairError;
    /// let slice = [0xd901, 'F' as u16, 'S' as u16];
    /// let mut merger = slice.iter().to_utf16chars();
    /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
    /// let mut remaining = merger.into_remaining_units();
    /// assert_eq!(remaining.next(), Some('F' as u16).as_ref());
    /// ```
    pub fn into_remaining_units(self) -> Chain<option::IntoIter<B>,I> {
        // Yield the buffered unit (if any) before the rest of the iterator.
        self.prev.into_iter().chain(self.iter)
    }
}
impl<B:Borrow<u16>, I:Iterator<Item=B>> Iterator for Utf16CharMerger<B,I> {
    type Item = Result<Utf16Char,Utf16PairError>;
    /// Combine units into codepoints, pairing leading and trailing surrogates.
    fn next(&mut self) -> Option<Self::Item> {
        // Use the unit buffered by a previous error before pulling a new one.
        let first = self.prev.take().or_else(|| self.iter.next() );
        first.map(|first| unsafe {
            match first.borrow().utf16_needs_extra_unit() {
                // SAFETY: Ok(false) means the unit is a standalone codepoint,
                // which is valid on its own (with the second unit zeroed).
                Ok(false) => Ok(Utf16Char::from_array_unchecked([*first.borrow(), 0])),
                // Ok(true): `first` is a leading surrogate and needs a partner.
                Ok(true) => match self.iter.next() {
                    Some(second) => match second.borrow().utf16_needs_extra_unit() {
                        // SAFETY: Err(Utf16FirstUnitError) means `second` is a
                        // trailing surrogate, so (leading, trailing) is a valid pair.
                        Err(Utf16FirstUnitError) => Ok(Utf16Char::from_tuple_unchecked((
                            *first.borrow(),
                            Some(*second.borrow())
                        ))),
                        Ok(_) => {
                            // Not a trailing surrogate: keep it for the next call
                            // so no unit is lost.
                            self.prev = Some(second);
                            Err(Utf16PairError::UnmatchedLeadingSurrogate)
                        }
                    },
                    // Leading surrogate at the very end of the input.
                    None => Err(Utf16PairError::Incomplete)
                },
                // A trailing surrogate without a preceding leading surrogate.
                Err(Utf16FirstUnitError) => Err(Utf16PairError::UnexpectedTrailingSurrogate),
            }
        })
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        let (iter_min, iter_max) = self.iter.size_hint();
        // cannot be exact, so KISS
        let min = iter_min / 2; // don't bother rounding up or accounting for self.prev
        // The buffered unit could become one extra codepoint.
        let max = match (iter_max, &self.prev) {
            (Some(max), &Some(_)) => max.checked_add(1),
            (max, _) => max,
        };
        (min, max)
    }
}
impl<B:Borrow<u16>, I:Iterator<Item=B>+Debug> Debug for Utf16CharMerger<B,I> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Show the buffered unit by value instead of through the Borrow wrapper.
        let buffered = self.prev.as_ref().map(|unit| *unit.borrow() );
        fmtr.debug_struct("Utf16CharMerger")
            .field("buffered", &buffered)
            .field("inner", &self.iter)
            .finish()
    }
}
/// An [`Utf16CharMerger`](struct.Utf16CharMerger.html) that also produces
/// offsets and lengths, but can only iterate over slices.
///
/// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharDecoder<'a> {
    /// The source units.
    slice: &'a[u16],
    /// Offset of the next unit `next()` will look at.
    index: usize,
}
impl<'a> From<&'a[u16]> for Utf16CharDecoder<'a> {
    /// Create a decoder that starts at the beginning of the slice.
    fn from(s: &'a[u16]) -> Self {
        Utf16CharDecoder{ index: 0, slice: s }
    }
}
impl<'a> Utf16CharDecoder<'a> {
    /// Extract the remainder of the source slice.
    ///
    /// The returned slice borrows from the source slice (`'a`), not from the
    /// iterator, matching `Utf8CharDecoder::as_slice()`. (Previously it was
    /// needlessly tied to `&self`.)
    ///
    /// # Examples
    ///
    /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed:
    /// ```
    /// # use encode_unicode::SliceExt;
    /// # use encode_unicode::error::Utf16PairError;
    /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices();
    /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1)));
    /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]);
    /// ```
    pub fn as_slice(&self) -> &'a[u16] {
        // `index` never exceeds `slice.len()`, so this cannot panic.
        &self.slice[self.index..]
    }
}
impl<'a> Iterator for Utf16CharDecoder<'a> {
    type Item = (usize,Result<Utf16Char,Utf16PairError>,usize);
    /// Decode the next codepoint, yielding `(offset, result, units consumed)`.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.index;
        match Utf16Char::from_slice_start(self.as_slice()) {
            Ok((u16c,len)) => {
                self.index += len;
                Some((start, Ok(u16c), len))
            },
            Err(EmptySlice) => None,
            Err(MissingSecond) => {
                // A lone leading surrogate at the very end: consume the rest.
                self.index = self.slice.len();
                Some((start, Err(Incomplete), 1))
            },
            Err(e) => {
                // Skip one unit so decoding can resynchronize.
                self.index += 1;
                let kind = if matches!(e, FirstIsTrailingSurrogate) {
                    UnexpectedTrailingSurrogate
                } else {
                    UnmatchedLeadingSurrogate
                };
                Some((start, Err(kind), 1))
            },
        }
    }
    #[inline]
    fn size_hint(&self) -> (usize,Option<usize>) {
        // Cannot be exact, so KISS and don't bother rounding up.
        // The slice is unlikely be full of surrogate pairs, so buffers
        // allocated with the lower bound will have to be grown anyway.
        let remaining = self.as_slice().len();
        (remaining/2, Some(remaining))
    }
}
impl<'a> Debug for Utf16CharDecoder<'a> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Show how far the iterator has advanced and what remains.
        write!(fmtr, "Utf16CharDecoder {{ units[{}..]: {:?} }}", self.index, self.as_slice())
    }
}

309
vendor/encode_unicode/src/errors.rs vendored Normal file
View File

@@ -0,0 +1,309 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Boilerplate-y error types.
//!
//! The discriminant values of the enums might change in minor releases.
//! (to reduce the size of the `Result<>` types they are returned in)
extern crate core;
use core::fmt::{self,Display,Formatter};
use core::ops::RangeInclusive;
#[cfg(feature="std")]
use std::error::Error;
// Implement `description()` (inherent in no_std mode, via the `Error` trait
// otherwise) plus a `Display` that writes the same text.
// `$desc` is a closure mapping `&$err` to a `&'static str`.
macro_rules! description {($err:ty, $desc:expr) => {
    #[cfg(not(feature="std"))]
    impl $err {
        #[allow(missing_docs)]
        pub fn description(&self) -> &'static str {
            ($desc)(self)
        }
    }
    #[cfg(feature="std")]
    impl Error for $err {
        fn description(&self) -> &'static str {
            ($desc)(self)
        }
    }
    impl Display for $err {
        fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
            #![allow(deprecated)] // calling our own function
            write!(fmtr, "{}", self.description())
        }
    }
}}
// Define a fieldless unit-struct error type with a single, fixed description.
macro_rules! single_cause {($(#[$doc:meta])* $err:ident => $desc:expr) => {
    $(#[$doc])*
    #[derive(Clone,Copy, Debug, PartialEq,Eq)]
    pub struct $err;
    // The value carries no information, so the description ignores it.
    description!{$err, |_| $desc }
}}
// Single-failure-mode errors, declared with the macro above.
single_cause!{
    /// Error returned by [`U16UtfExt::utf16_needs_extra_unit()`](../trait.U16UtfExt.html#tymethod.utf16_needs_extra_unit)
    /// when called on an `u16` that's a trailing surrogate.
    Utf16FirstUnitError => "is a trailing surrogate"
}
single_cause!{
    /// Error returned by [`Utf8Char::from_ascii()`](../struct.Utf8Char.html#method.from_ascii)
    /// for bytes that are not ASCII characters.
    NonAsciiError => "not an ASCII character"
}
single_cause!{
    /// Error returned by [`Utf16Char::from_bmp()`](../struct.Utf16Char.html#method.from_bmp)
    /// for units that are not a standalone codepoint.
    NonBmpError => "not a codepoint in the basic multilingual plane"
}
single_cause!{
    /// Error returned by [`Utf8Char::from_str_start()`](../struct.Utf8Char.html#method.from_str_start)
    /// and [`Utf16Char::from_str_start()`](../struct.Utf16Char.html#method.from_str_start)
    /// when called with an empty string.
    EmptyStrError => "is empty"
}
// Define an error enum where each variant has a fixed description string.
macro_rules! simple {($(#[$tydoc:meta])* $err:ident {
    $( $(#[$vardoc:meta])* $variant:ident => $string:expr, )+
} ) => {
    $(#[$tydoc])*
    #[derive(Clone,Copy, Debug, PartialEq,Eq)]
    pub enum $err {
        $( $(#[$vardoc])* $variant, )*
    }
    // Map each variant to its description string.
    description!{$err, |e: &$err| match *e {$($err::$variant => $string),*} }
}}
simple!{
    /// Error returned when an `u32` is not a valid unicode codepoint.
    CodepointError {
        /// It's reserved for UTF-16 surrogate pairs.
        Utf16Reserved => "is reserved for UTF-16 surrogate pairs",
        /// It's higher than the highest codepoint (which is 0x10ffff).
        TooHigh => "is higher than the highest codepoint",
    }}
use CodepointError::*;
impl CodepointError {
    /// Get the range of values for which this error would be given.
    pub const fn error_range(self) -> RangeInclusive<u32> {match self {
        Utf16Reserved => 0xd8_00..=0xdf_ff,
        // 0x10_ffff is the highest *valid* codepoint, so the erroring
        // range starts just above it.
        TooHigh => 0x00_11_00_00..=0xff_ff_ff_ff,
    }}
}
simple!{
    /// Error returned when an `[u16; 2]` doesn't form a valid UTF-16 codepoint.
    Utf16ArrayError {
        /// The first element is a trailing / low surrogate, which is never valid.
        FirstIsTrailingSurrogate => "the first element is a trailing surrogate",
        /// The second element is needed, but is not a trailing surrogate.
        SecondIsNotTrailingSurrogate => "the second element is needed but is not a trailing surrogate",
    }}
simple!{
    /// Error returned when one or two `u16`s are not valid UTF-16.
    ///
    /// They are returned in sinking precedence;
    /// The condition that causes the first variant to be returned is checked
    /// for before the condition the next variant is returned for.
    Utf16TupleError {
        /// The first unit is a trailing / low surrogate, which is never valid.
        FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
        /// The provided second unit is not necessary.
        SuperfluousSecond => "the second unit is superfluous",
        /// The first and only unit requires a second unit.
        MissingSecond => "the first unit requires a second unit",
        /// The second unit is needed and was provided, but is not a trailing surrogate.
        SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
    }}
simple!{
    /// Error returned when a slice of `u16`s doesn't start with valid UTF-16.
    Utf16SliceError {
        /// The slice is empty.
        EmptySlice => "the slice is empty",
        /// The first unit is a trailing surrogate.
        FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
        /// The first and only unit requires a second unit.
        MissingSecond => "the first and only unit requires a second one",
        /// The first unit requires a second one, but it's not a trailing surrogate.
        SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
    }}
simple!{
    /// Error returned by [`Utf16CharMerger`](../iterator/struct.Utf16CharMerger.html#impl-Iterator)
    /// and [`Utf16CharDecoder`](../iterator/struct.Utf16CharDecoder.html#impl-Iterator)
    /// when they encounter an invalid sequence.
    Utf16PairError {
        /// A trailing surrogate was not preceded by a leading surrogate.
        UnexpectedTrailingSurrogate => "a trailing surrogate was not preceeded by a leading surrogate",
        /// A leading surrogate was followed by a unit that was not a trailing surrogate.
        UnmatchedLeadingSurrogate => "a leading surrogate was followed by an unit that was not a trailing surrogate",
        /// A trailing surrogate was expected when the end was reached.
        Incomplete => "a trailing surrogate was expected when the end was reached",
    }}
simple!{
    /// Error returned when [`Utf8Char::from_str()`](../struct.Utf8Char.html#impl-FromStr)
    /// or [`Utf16Char::from_str()`](../struct.Utf16Char.html#impl-FromStr) fails.
    FromStrError {
        /// `Utf8Char` and `Utf16Char` cannot store more than a single codepoint.
        MultipleCodepoints => "contains more than one codepoint",
        /// `Utf8Char` and `Utf16Char` cannot be empty.
        Empty => "is empty",
    }
}
/// Error returned when an invalid UTF-8 sequence is encountered.
///
/// See [`Utf8ErrorKind`](enum.Utf8ErrorKind.html) for the types of errors
/// that this type can be returned for.
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub struct Utf8Error {
    pub(crate) kind: Utf8ErrorKind,
}
impl Utf8Error {
    /// Get the type of error.
    pub const fn kind(&self) -> Utf8ErrorKind {
        self.kind
    }
    // Inherent, const replacement for `Error::description()` in no_std mode.
    #[cfg(not(feature="std"))]
    #[allow(missing_docs)]
    pub const fn description(&self) -> &'static str {
        utf8_error_description(self.kind)
    }
}
#[cfg(feature="std")]
impl Error for Utf8Error {
    fn description(&self) -> &'static str {
        utf8_error_description(self.kind)
    }
}
impl Display for Utf8Error {
    // Writes the same short text as `description()`.
    fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
        fmtr.write_str(utf8_error_description(self.kind))
    }
}
/// The types of errors that can occur when decoding a UTF-8 codepoint.
///
/// The variants are more technical than what an end user is likely interested
/// in, but might be useful for deciding how to handle the error.
///
/// They can be grouped into three categories:
/// * Will happen regularly if decoding chunked or buffered text: `TooFewBytes`.
/// * Input might be binary, a different encoding or corrupted:
///   `UnexpectedContinuationByte` and `InterruptedSequence`
///   (broken UTF-8 sequence).
/// * Less likely to happen accidentally and might be malicious:
///   `OverlongEncoding`, `Utf16ReservedCodepoint` and `TooHighCodepoint`.
///   Note that these can still be caused by certain valid latin-1 strings
///   such as `"Á©"` (`b"\xC1\xA9"`).
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum Utf8ErrorKind {
    /// There are too few bytes to decode the codepoint.
    ///
    /// This can happen when a slice is empty or too short, or an iterator
    /// returned `None` while in the middle of a codepoint.
    /// This error is never produced by functions accepting fixed-size
    /// `[u8; 4]` arrays.
    ///
    /// If decoding text coming chunked (such as in buffers passed to `Read`),
    /// the remaining bytes should be carried over into the next chunk or buffer.
    /// (including the byte this error was produced for.)
    TooFewBytes,
    /// A byte which is never used by well-formed UTF-8 was encountered.
    ///
    /// This means that the input is using a different encoding,
    /// is corrupted or binary.
    ///
    /// This error is returned when a byte in the following ranges
    /// is encountered anywhere in an UTF-8 sequence:
    ///
    /// * `192` and `193` (`0b1100_000x`): Indicates an overlong encoding
    ///   of a single-byte, ASCII, character, and should therefore never occur.
    /// * `248..` (`0b1111_1xxx`): Sequences cannot be longer than 4 bytes.
    /// * `245..=247` (`0b1111_0101`, `0b1111_0110` and `0b1111_0111`):
    ///   Indicates a too high codepoint. (above `\u10ffff`)
    NonUtf8Byte,
    /// The first byte is not a valid start of a codepoint.
    ///
    /// This might happen as a result of slicing into the middle of a codepoint,
    /// the input not being UTF-8 encoded or being corrupted.
    /// Errors of this type coming right after another error should probably
    /// be ignored, unless returned more than three times in a row.
    ///
    /// This error is returned when the first byte has a value in the range
    /// `128..=191` (`0b1000_0000..=0b1011_1111`).
    UnexpectedContinuationByte,
    /// The byte at index 1..=3 should be a continuation byte,
    /// but doesn't fit the pattern `0b10xx_xxxx`.
    ///
    /// When the input slice or iterator has too few bytes,
    /// [`TooFewBytes`](#variant.TooFewBytes) is returned instead.
    InterruptedSequence,
    /// The encoding of the codepoint has so many leading zeroes that it
    /// could be a byte shorter.
    ///
    /// [Successfully decoding this can present a security issue](https://tools.ietf.org/html/rfc3629#section-10):
    /// Doing so could allow an attacker to circumvent input validation that
    /// only checks for ASCII characters, and input characters or strings that
    /// would otherwise be rejected, such as `/../`.
    ///
    /// This error is only returned for 3 and 4-byte encodings;
    /// `NonUtf8Byte` is returned for bytes that start longer or shorter
    /// overlong encodings.
    OverlongEncoding,
    /// The codepoint is reserved for UTF-16 surrogate pairs.
    ///
    /// (`Utf8Char` cannot be used to work with the
    /// [WTF-8](https://simonsapin.github.io/wtf-8) encoding for UCS-2 strings.)
    ///
    /// This error is returned for codepoints in the range `\ud800`..=`\udfff`.
    /// (which are three bytes long as UTF-8)
    Utf16ReservedCodepoint,
    /// The codepoint is higher than `\u10ffff`, which is the highest codepoint
    /// unicode permits.
    TooHighCodepoint,
}
const fn utf8_error_description(kind: Utf8ErrorKind) -> &'static str {
match kind {
Utf8ErrorKind::TooFewBytes => "too few bytes",
Utf8ErrorKind::NonUtf8Byte => "not UTF-8",
Utf8ErrorKind::UnexpectedContinuationByte => "not UTF-8",
Utf8ErrorKind::InterruptedSequence => "not UTF-8",
Utf8ErrorKind::OverlongEncoding => "malformed input",
Utf8ErrorKind::Utf16ReservedCodepoint => "malformed input",
Utf8ErrorKind::TooHighCodepoint => "invalid character",
}
}
/// Compare the error's kind directly against a kind.
impl PartialEq<Utf8ErrorKind> for Utf8Error {
    fn eq(&self, kind: &Utf8ErrorKind) -> bool {
        *kind == self.kind
    }
}
/// Compare a kind directly against the error's kind.
impl PartialEq<Utf8Error> for Utf8ErrorKind {
    fn eq(&self, error: &Utf8Error) -> bool {
        error.kind == *self
    }
}

89
vendor/encode_unicode/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,89 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
* Copyright 2018 Aljoscha Meyer
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
/*!
Miscellaneous UTF-8 and UTF-16 types and methods.
# Optional features:
* `#![no_std]`-mode: There are a few differences:
* `Error` doesn't exist, but `description()` is made available as an inherent impl.
* `Extend`/`FromIterator`-implementations for `String`/`Vec<u8>`/`Vec<u16>` are missing.
* There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` doesn't implement `Read`.
This feature is enabled by setting `default-features=false` in `Cargo.toml`:
`encode_unicode = {version="0.3.4", default-features=false}`
* Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate:
Convert `Utf8Char` and `Utf16Char` to and from
[`ascii::AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html).
# Minimum supported Rust version
The minimum supported Rust version for 1.0.\* releases is 1.56.
Later 1.y.0 releases might require newer Rust versions, but the three most
recent stable releases at the time of publishing will always be supported.
For example this means that if the current stable Rust version is 1.66 when
`encode_unicode` 1.1.0 is released, then `encode_unicode` 1.1.\* will
not require a newer Rust version than 1.63.
[crates.io page](https://crates.io/crates/encode_unicode)
[github repository](https://github.com/tormol/encode_unicode)
*/
#![cfg_attr(not(feature="std"), no_std)]
#![warn(missing_docs, unsafe_op_in_unsafe_fn)]
#![allow(
clippy::unusual_byte_groupings,// I sometimes group into UTF-8 control part and codepoint part
clippy::derive_hash_xor_eq,// tested
clippy::len_without_is_empty,// the character types are never empty
clippy::needless_return,// `foo.bar();\n foo` looks unfinished
clippy::redundant_closure_call,// not redundant in macros
clippy::cast_lossless,// the sizes are part of the struct name and so won't change
clippy::many_single_char_names,// the variables are in different scopes
clippy::cmp_owned,// smaller than pointer, and no allocations anyway
clippy::wrong_self_convention,// smaller than pointer
clippy::needless_range_loop,// the suggested iterator chains are less intuitive
clippy::identity_op,// applying a set of opereations with varying arguments to many elements looks nice
clippy::get_first,// .get(0), .get(1) is more readable
clippy::question_mark,// I prefer it very explicit
)]
#![warn(clippy::doc_markdown, clippy::manual_filter_map)]
// opt-in lints that might be interesting to recheck once in a while:
//#![warn(clippy::unwrap_used)]
// Private implementation modules; the public surface is re-exported below.
mod errors;
mod traits;
mod utf8_char;
mod utf8_iterators;
mod utf16_char;
mod utf16_iterators;
mod decoding_iterators;
pub use traits::{CharExt, U8UtfExt, U16UtfExt, StrExt, IterExt, SliceExt};
pub use utf8_char::Utf8Char;
pub use utf16_char::Utf16Char;
pub mod error {// keeping the public interface in one file
    //! Errors returned by various conversion methods in this crate.
    pub use crate::errors::{FromStrError, EmptyStrError};
    pub use crate::errors::{CodepointError, NonAsciiError, NonBmpError};
    pub use crate::errors::{Utf8Error, Utf8ErrorKind};
    pub use crate::errors::{Utf16SliceError, Utf16ArrayError, Utf16TupleError};
    pub use crate::errors::{Utf16FirstUnitError, Utf16PairError};
}
pub mod iterator {
    //! Iterator types that you should rarely need to name
    pub use crate::utf8_iterators::{Utf8Iterator, Utf8CharSplitter, Utf8Chars, Utf8CharIndices};
    pub use crate::utf16_iterators::{Utf16Iterator, Utf16CharSplitter, Utf16Chars, Utf16CharIndices};
    pub use crate::decoding_iterators::{Utf8CharMerger, Utf8CharDecoder};
    pub use crate::decoding_iterators::{Utf16CharMerger, Utf16CharDecoder};
}

1012
vendor/encode_unicode/src/traits.rs vendored Normal file

File diff suppressed because it is too large Load Diff

692
vendor/encode_unicode/src/utf16_char.rs vendored Normal file
View File

@@ -0,0 +1,692 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::utf16_iterators::Utf16Iterator;
use crate::traits::{CharExt, U16UtfExt};
use crate::utf8_char::Utf8Char;
use crate::errors::{Utf16SliceError, Utf16ArrayError, Utf16TupleError};
use crate::errors::{NonBmpError, EmptyStrError, FromStrError};
extern crate core;
use core::{hash,fmt};
use core::cmp::Ordering;
use core::borrow::Borrow;
use core::ops::Deref;
use core::str::FromStr;
#[cfg(feature="std")]
use core::iter::FromIterator;
#[cfg(feature="ascii")]
use core::char;
#[cfg(feature="ascii")]
extern crate ascii;
#[cfg(feature="ascii")]
use ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError};
// I don't think there is any good default value for char, but char does.
#[derive(Default)]
// char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either.
// When it's a single unit, the second is zero, so Eq works.
// #[derive(Ord)] however, breaks on surrogate pairs.
#[derive(PartialEq,Eq)]
#[derive(Clone,Copy)]
/// An unicode codepoint stored as UTF-16.
///
/// It can be borrowed as an `u16` slice, and has the same size as `char`.
pub struct Utf16Char {
    /// Invariant: always well-formed UTF-16;
    /// `units[1]` is zero when the codepoint fits in a single unit.
    units: [u16; 2],
}
/////////////////////
//conversion traits//
/////////////////////
impl FromStr for Utf16Char {
    type Err = FromStrError;
    /// Create an `Utf16Char` from a string slice.
    /// The string must contain exactly one codepoint.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::error::FromStrError::*;
    /// use encode_unicode::Utf16Char;
    /// use std::str::FromStr;
    ///
    /// assert_eq!(Utf16Char::from_str("a"), Ok(Utf16Char::from('a')));
    /// assert_eq!(Utf16Char::from_str("🂠"), Ok(Utf16Char::from('🂠')));
    /// assert_eq!(Utf16Char::from_str(""), Err(Empty));
    /// assert_eq!(Utf16Char::from_str("ab"), Err(MultipleCodepoints));
    /// assert_eq!(Utf16Char::from_str("é"), Err(MultipleCodepoints));// 'e'+u301 combining mark
    /// ```
    fn from_str(s: &str) -> Result<Self, FromStrError> {
        // Decode the first codepoint, then require it to span the whole string.
        match Utf16Char::from_str_start(s) {
            Err(EmptyStrError) => Err(FromStrError::Empty),
            Ok((u16c, consumed)) if consumed == s.len() => Ok(u16c),
            Ok(_) => Err(FromStrError::MultipleCodepoints),
        }
    }
}
impl From<char> for Utf16Char {
    /// Encode the codepoint as one or two UTF-16 units.
    fn from(c: char) -> Self {
        let (first, second) = c.to_utf16_tuple();
        // A missing second unit is stored as zero.
        let trailing = second.unwrap_or_default();
        Utf16Char{ units: [first, trailing] }
    }
}
impl From<Utf8Char> for Utf16Char {
    /// Transcode without going through `char`, by moving the codepoint's
    /// bits directly from the UTF-8 bytes into UTF-16 units.
    fn from(utf8: Utf8Char) -> Utf16Char {
        let (b, utf8_len) = utf8.to_array();
        match utf8_len {
            // ASCII: the byte is the unit.
            1 => Utf16Char{ units: [b[0] as u16, 0] },
            4 => {// need surrogate
                // Leading surrogate is 0xd800 + ((codepoint - 0x1_0000) >> 10);
                // the subtraction is folded into the constant:
                // 0x1_0000 >> 10 == 0x40, so start from 0xd800 - 0x40.
                let mut first = 0xd800 - (0x01_00_00u32 >> 10) as u16;
                // Add codepoint bits 18..=20, 12..=17 and 10..=11.
                first += (b[0] as u16 & 0x07) << 8;
                first += (b[1] as u16 & 0x3f) << 2;
                first += (b[2] as u16 & 0x30) >> 4;
                // Trailing surrogate is 0xdc00 | (codepoint & 0x3ff):
                // bits 6..=9 from the third byte, bits 0..=5 from the fourth.
                let mut second = 0xdc00;
                second |= (b[2] as u16 & 0x0f) << 6;
                second |= b[3] as u16 & 0x3f;
                Utf16Char{ units: [first, second] }
            },
            _ => { // 2 or 3
                // Two- and three-byte codepoints fit in a single unit:
                // concatenate the payload bits of each byte.
                let mut unit = ((b[0] as u16 & 0x1f) << 6) | (b[1] as u16 & 0x3f);
                if utf8_len == 3 {
                    unit = (unit << 6) | (b[2] as u16 & 0x3f);
                }
                Utf16Char{ units: [unit, 0] }
            },
        }
    }
}
impl From<Utf16Char> for char {
    fn from(uc: Utf16Char) -> char {
        // Relies on the invariant that a Utf16Char always holds
        // well-formed UTF-16, so the unchecked conversion is fine.
        char::from_utf16_array_unchecked(uc.to_array())
    }
}
impl IntoIterator for Utf16Char {
    type Item=u16;
    type IntoIter=Utf16Iterator;
    /// Iterate over the units.
    fn into_iter(self) -> Utf16Iterator {
        Utf16Iterator::from(self)
    }
}
#[cfg(feature="std")]
impl Extend<Utf16Char> for Vec<u16> {
    /// Append the units of every character.
    fn extend<I:IntoIterator<Item=Utf16Char>>(&mut self, iter: I) {
        let characters = iter.into_iter();
        // Reserve the guaranteed minimum: one unit per character.
        self.reserve(characters.size_hint().0);
        for c in characters {
            let [first, second] = c.units;
            self.push(first);
            if second != 0 {
                self.push(second);
            }
        }
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf16Char> for Vec<u16> {
    fn extend<I:IntoIterator<Item=&'a Utf16Char>>(&mut self, iter: I) {
        // Utf16Char is Copy, so copying is as cheap as cloning.
        self.extend(iter.into_iter().copied())
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf16Char> for Vec<u16> {
    /// Collect the units of every character into a new vector.
    fn from_iter<I:IntoIterator<Item=Utf16Char>>(iter: I) -> Self {
        let mut units = Vec::new();
        units.extend(iter);
        units
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf16Char> for Vec<u16> {
    fn from_iter<I:IntoIterator<Item=&'a Utf16Char>>(iter: I) -> Self {
        // Utf16Char is Copy; delegate to the by-value impl.
        iter.into_iter().copied().collect()
    }
}
#[cfg(feature="std")]
impl Extend<Utf16Char> for String {
    /// Append every character, transcoded to UTF-8.
    fn extend<I:IntoIterator<Item=Utf16Char>>(&mut self, iter: I) {
        let utf8 = iter.into_iter().map(Utf8Char::from);
        self.extend(utf8);
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf16Char> for String {
    fn extend<I:IntoIterator<Item=&'a Utf16Char>>(&mut self, iter: I) {
        // Utf16Char is Copy; delegate to the by-value impl.
        self.extend(iter.into_iter().copied());
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf16Char> for String {
    /// Collect the characters, transcoded to UTF-8, into a new string.
    fn from_iter<I:IntoIterator<Item=Utf16Char>>(iter: I) -> Self {
        let mut out = String::new();
        out.extend(iter);
        out
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf16Char> for String {
    fn from_iter<I:IntoIterator<Item=&'a Utf16Char>>(iter: I) -> Self {
        // Utf16Char is Copy; delegate to the by-value impl.
        iter.into_iter().copied().collect()
    }
}
/////////////////
//getter traits//
/////////////////
impl AsRef<[u16]> for Utf16Char {
    /// Borrow only the used units (`self.len()` of them).
    #[inline]
    fn as_ref(&self) -> &[u16] {
        &self.units[..self.len()]
    }
}
impl Borrow<[u16]> for Utf16Char {
    #[inline]
    fn borrow(&self) -> &[u16] {
        self.as_ref()
    }
}
impl Deref for Utf16Char {
    type Target = [u16];
    #[inline]
    fn deref(&self) -> &[u16] {
        self.as_ref()
    }
}
////////////////
//ascii traits//
////////////////
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl From<AsciiChar> for Utf16Char {
    #[inline]
    fn from(ac: AsciiChar) -> Self {
        // ASCII always fits in a single unit.
        Utf16Char{ units: [ac.as_byte() as u16, 0] }
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl ToAsciiChar for Utf16Char {
    #[inline]
    fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> {
        // For surrogate pairs units[0] is a surrogate (>= 0xd800),
        // which u16::to_ascii_char() will reject.
        self.units[0].to_ascii_char()
    }
    #[inline]
    unsafe fn to_ascii_char_unchecked(self) -> AsciiChar {
        // SAFETY: the caller guarantees the codepoint is ASCII,
        // in which case it is stored entirely in units[0].
        unsafe { self.units[0].to_ascii_char_unchecked() }
    }
}
/////////////////////////////////////////////////////////
//General traits that cannot be derived to emulate char//
/////////////////////////////////////////////////////////
impl hash::Hash for Utf16Char {
    /// Hash the codepoint, so the hash matches `char`'s by construction.
    fn hash<H : hash::Hasher>(&self, state: &mut H) {
        self.to_char().hash(state);
    }
}
impl fmt::Debug for Utf16Char {
    /// Format like the corresponding `char` would.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(&self.to_char(), fmtr)
    }
}
impl fmt::Display for Utf16Char {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Display via the UTF-8 representation.
        fmt::Display::fmt(&Utf8Char::from(*self), fmtr)
    }
}
// Cannot derive these impls because two-unit characters must always compare
// greater than one-unit ones.
impl PartialOrd for Utf16Char {
    #[inline]
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        Some(self.cmp(rhs))
    }
}
impl Ord for Utf16Char {
    #[inline]
    fn cmp(&self, rhs: &Self) -> Ordering {
        // Shift the first unit by 0xd if surrogate, and 0 otherwise.
        // (a trailing surrogate is 0b1101_11xx_xxxx_xxxx, so its top four
        //  bits, `units[1] >> 12`, are 0b1101 = 0xd; otherwise units[1] is 0.)
        // This ensures surrogates are always greater than 0xffff, and
        // that the second unit only affect the result when the first are equal.
        // Multiplying by a constant factor isn't enough because that factor
        // would have to be greater than 1023 and smaller than 5.5.
        // This transformation is less complicated than combine_surrogates().
        let lhs = (self.units[0] as u32, self.units[1] as u32);
        let rhs = (rhs.units[0] as u32, rhs.units[1] as u32);
        let lhs = (lhs.0 << (lhs.1 >> 12)) + lhs.1;
        let rhs = (rhs.0 << (rhs.1 >> 12)) + rhs.1;
        lhs.cmp(&rhs)
    }
}
////////////////////////////////
//Comparisons with other types//
////////////////////////////////
impl PartialEq<char> for Utf16Char {
fn eq(&self, u32c: &char) -> bool {
*self == Utf16Char::from(*u32c)
}
}
impl PartialEq<Utf16Char> for char {
fn eq(&self, u16c: &Utf16Char) -> bool {
Utf16Char::from(*self) == *u16c
}
}
impl PartialOrd<char> for Utf16Char {
    fn partial_cmp(&self, u32c: &char) -> Option<Ordering> {
        // Convert and reuse the Utf16Char ordering.
        self.partial_cmp(&Utf16Char::from(*u32c))
    }
}
impl PartialOrd<Utf16Char> for char {
    fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
        // Mirror of the impl above, so `<`/`>` work in both directions.
        Utf16Char::from(*self).partial_cmp(u16c)
    }
}
impl PartialEq<Utf8Char> for Utf16Char {
    fn eq(&self, u8c: &Utf8Char) -> bool {
        // Compare in the UTF-16 representation.
        *self == Utf16Char::from(*u8c)
    }
}
impl PartialOrd<Utf8Char> for Utf16Char {
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        // Compare in the UTF-16 representation.
        self.partial_cmp(&Utf16Char::from(*u8c))
    }
}
// The other direction is implemented in utf8_char.rs
/// Only considers the unit equal if the codepoint of the `Utf16Char` is not
/// made up of a surrogate pair.
///
/// There is no impl in the opposite direction, as this should only be used to
/// compare `Utf16Char`s against constants.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert!(Utf16Char::from('6') == b'6' as u16);
/// assert!(Utf16Char::from('\u{FFFF}') == 0xffff_u16);
/// assert!(Utf16Char::from_tuple((0xd876, Some(0xdef9))).unwrap() != 0xd876_u16);
/// ```
impl PartialEq<u16> for Utf16Char {
    fn eq(&self, unit: &u16) -> bool {
        // units[1] is zero exactly when the codepoint is a single unit.
        self.units[0] == *unit && self.units[1] == 0
    }
}
/// Only considers the byte equal if the codepoint of the `Utf16Char` is <= U+FF.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert!(Utf16Char::from('6') == b'6');
/// assert!(Utf16Char::from('\u{00FF}') == b'\xff');
/// assert!(Utf16Char::from('\u{0100}') != b'\0');
/// ```
impl PartialEq<u8> for Utf16Char {
    fn eq(&self, byte: &u8) -> bool {
        // A leading surrogate is >= 0xd800, so a first unit that fits in a
        // byte can never be part of a surrogate pair: comparing it suffices.
        self.units[0] == *byte as u16
    }
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII never compare equal.
impl PartialEq<AsciiChar> for Utf16Char {
    #[inline]
    fn eq(&self, ascii: &AsciiChar) -> bool {
        // ASCII never needs a surrogate pair, so the first unit suffices.
        self.units[0] == *ascii as u16
    }
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII never compare equal.
impl PartialEq<Utf16Char> for AsciiChar {
    #[inline]
    fn eq(&self, u16c: &Utf16Char) -> bool {
        // Mirror of the impl above, so `==` works in both directions.
        *self as u16 == u16c.units[0]
    }
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII always compare greater.
impl PartialOrd<AsciiChar> for Utf16Char {
    #[inline]
    fn partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering> {
        // Leading surrogates are >= 0xd800 > 127, so comparing only the
        // first unit still orders multi-unit codepoints after all ASCII.
        self.units[0].partial_cmp(&(*ascii as u16))
    }
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII always compare greater.
impl PartialOrd<Utf16Char> for AsciiChar {
    #[inline]
    fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
        // Mirror of the impl above, so `<`/`>` work in both directions.
        (*self as u16).partial_cmp(&u16c.units[0])
    }
}
///////////////////////////////////////////////////////
//pub impls that should be together for nicer rustdoc//
///////////////////////////////////////////////////////
impl Utf16Char {
    /// A `const fn` alternative to the trait-based `Utf16Char::from(char)`.
    ///
    /// # Examples
    ///
    /// ```
    /// # use encode_unicode::Utf16Char;
    /// const REPLACEMENT_CHARACTER: Utf16Char = Utf16Char::new('\u{fffd}');
    /// ```
    pub const fn new(c: char) -> Self {
        if c <= '\u{ffff}' {
            Utf16Char{ units: [c as u16, 0] }
        } else {
            // Encode as a surrogate pair: subtract the BMP range and split
            // the remaining 20 bits across the two units.
            let c = (c as u32).wrapping_sub(0x01_00_00);
            let first = 0xd8_00 | (c >> 10) as u16;
            let second = 0xdc_00 | (c & 0x0_03_ff) as u16;
            Utf16Char{ units: [first, second] }
        }
    }
    /// Create an `Utf16Char` from the first codepoint in a string slice,
    /// converting from UTF-8 to UTF-16.
    ///
    /// The returned `usize` is the number of UTF-8 bytes used from the str,
    /// and not the number of UTF-16 units.
    ///
    /// Returns an error if the `str` is empty.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::Utf16Char;
    ///
    /// assert_eq!(Utf16Char::from_str_start("a"), Ok((Utf16Char::from('a'),1)));
    /// assert_eq!(Utf16Char::from_str_start("ab"), Ok((Utf16Char::from('a'),1)));
    /// assert_eq!(Utf16Char::from_str_start("🂠 "), Ok((Utf16Char::from('🂠'),4)));
    /// assert_eq!(Utf16Char::from_str_start("é"), Ok((Utf16Char::from('e'),1)));// 'e'+u301 combining mark
    /// assert!(Utf16Char::from_str_start("").is_err());
    /// ```
    pub const fn from_str_start(s: &str) -> Result<(Self,usize), EmptyStrError> {
        if s.is_empty() {
            return Err(EmptyStrError);
        }
        let b = s.as_bytes();
        // Read the last byte first to reduce the number of unnecessary length checks.
        // The input is a `str`, so the bytes are known to be valid UTF-8 and
        // the leading byte alone determines the sequence length.
        match b[0] {
            0..=127 => {// 1 byte => 1 unit
                let unit = b[0] as u16;// 0b0000_0000_0xxx_xxxx
                Ok((Utf16Char{ units: [unit, 0] }, 1))
            },
            0b1000_0000..=0b1101_1111 => {// 2 bytes => 1 unit
                let unit = (((b[1] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
                         | (((b[0] & 0x1f) as u16) << 6);// 0b0000_0xxx_xx00_0000
                Ok((Utf16Char{ units: [unit, 0] }, 2))
            },
            0b1110_0000..=0b1110_1111 => {// 3 bytes => 1 unit
                let unit = (((b[2] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
                         | (((b[1] & 0x3f) as u16) << 6) // 0b0000_xxxx_xx00_0000
                         | (((b[0] & 0x0f) as u16) << 12);// 0bxxxx_0000_0000_0000
                Ok((Utf16Char{ units: [unit, 0] }, 3))
            },
            _ => {// 4 bytes => 2 units
                let second = 0xdc00 // 0b1101_1100_0000_0000
                           | (((b[3] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
                           | (((b[2] & 0x0f) as u16) << 6);// 0b0000_00xx_xx00_0000
                let first = 0xd800-(0x01_00_00u32>>10) as u16// 0b1101_0111_1100_0000
                          + (((b[2] & 0x30) as u16) >> 4) // 0b0000_0000_0000_00xx
                          + (((b[1] & 0x3f) as u16) << 2) // 0b0000_0000_xxxx_xx00
                          + (((b[0] & 0x07) as u16) << 8); // 0b0000_0xxx_0000_0000
                Ok((Utf16Char{ units: [first, second] }, 4))
            }
        }
    }
    /// Validate and store the first UTF-16 codepoint in the slice.
    /// Also return how many units were needed.
    pub fn from_slice_start(src: &[u16]) -> Result<(Self,usize), Utf16SliceError> {
        char::from_utf16_slice_start(src).map(|(_,len)| {
            let second = if len==2 {src[1]} else {0};
            (Utf16Char{ units: [src[0], second] }, len)
        })
    }
    /// Store the first UTF-16 codepoint of the slice.
    ///
    /// # Safety
    ///
    /// The slice must be non-empty and start with a valid UTF-16 codepoint.
    /// The length of the slice is never checked.
    pub unsafe fn from_slice_start_unchecked(src: &[u16]) -> (Self,usize) {
        // SAFETY: the caller guarantees the slice is non-empty and starts
        // with a valid codepoint, so index 1 exists whenever the first unit
        // is a leading surrogate.
        unsafe {
            let first = *src.get_unchecked(0);
            if first.is_utf16_leading_surrogate() {
                (Utf16Char{ units: [first, *src.get_unchecked(1)] }, 2)
            } else {
                (Utf16Char{ units: [first, 0] }, 1)
            }
        }
    }
    /// Validate and store an UTF-16 array as returned from `char.to_utf16_array()`.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::Utf16Char;
    /// use encode_unicode::error::Utf16ArrayError;
    ///
    /// assert_eq!(Utf16Char::from_array(['x' as u16, 'y' as u16]), Ok(Utf16Char::from('x')));
    /// assert_eq!(Utf16Char::from_array(['睷' as u16, 0]), Ok(Utf16Char::from('睷')));
    /// assert_eq!(Utf16Char::from_array([0xda6f, 0xdcde]), Ok(Utf16Char::from('\u{abcde}')));
    /// assert_eq!(Utf16Char::from_array([0xf111, 0xdbad]), Ok(Utf16Char::from('\u{f111}')));
    /// assert_eq!(Utf16Char::from_array([0xdaaf, 0xdaaf]), Err(Utf16ArrayError::SecondIsNotTrailingSurrogate));
    /// assert_eq!(Utf16Char::from_array([0xdcac, 0x9000]), Err(Utf16ArrayError::FirstIsTrailingSurrogate));
    /// ```
    pub const fn from_array(units: [u16; 2]) -> Result<Self,Utf16ArrayError> {
        // 0xd800..=0xdfff is the surrogate range; anything else stands alone
        // and the second unit is ignored (normalized to 0).
        if (units[0] & 0xf8_00) != 0xd8_00 {
            Ok(Utf16Char { units: [units[0], 0] })
        } else if units[0] < 0xdc_00 && (units[1] & 0xfc_00) == 0xdc_00 {
            Ok(Utf16Char { units })
        } else if units[0] < 0xdc_00 {
            Err(Utf16ArrayError::SecondIsNotTrailingSurrogate)
        } else {
            Err(Utf16ArrayError::FirstIsTrailingSurrogate)
        }
    }
    /// Create an `Utf16Char` from an array as returned from `char.to_utf16_array()`.
    ///
    /// # Safety
    ///
    /// The units must form a valid codepoint, and the second unit must be 0
    /// when a surrogate pair is not required.
    /// Violating this can easily lead to undefined behavior, although unlike
    /// `char` bad `Utf16Char`s simply existing is not immediately UB.
    pub const unsafe fn from_array_unchecked(units: [u16; 2]) -> Self {
        Utf16Char { units }
    }
    // Shared validation for tuple-based constructors; returns `Ok(())`
    // only for a lone BMP unit or a correctly-ordered surrogate pair.
    pub(crate) const fn validate_tuple(utf16: (u16,Option<u16>)) -> Result<(),Utf16TupleError> {
        match utf16 {
            (0x00_00..=0xd7_ff, None) | // single
            (0xe0_00..=0xff_ff, None) | // single
            (0xd8_00..=0xdb_ff, Some(0xdc_00..=0xdf_ff)) // correct surrogate
                => Ok(()),
            (0xd8_00..=0xdb_ff, Some(_)) => Err(Utf16TupleError::SecondIsNotTrailingSurrogate),
            (0xd8_00..=0xdb_ff, None   ) => Err(Utf16TupleError::MissingSecond),
            (0xdc_00..=0xdf_ff,    _   ) => Err(Utf16TupleError::FirstIsTrailingSurrogate),
            (        _        , Some(_)) => Err(Utf16TupleError::SuperfluousSecond),
        }
    }
    /// Validate and store a UTF-16 pair as returned from `char.to_utf16_tuple()`.
    pub const fn from_tuple(utf16: (u16,Option<u16>)) -> Result<Self,Utf16TupleError> {
        // SAFETY: the unchecked constructor is only reached after
        // validate_tuple() has accepted the pair.
        unsafe {
            match Self::validate_tuple(utf16) {
                Ok(()) => Ok(Self::from_tuple_unchecked(utf16)),
                Err(e) => Err(e),
            }
        }
    }
    /// Create an `Utf16Char` from a tuple as returned from `char.to_utf16_tuple()`.
    ///
    /// # Safety
    ///
    /// The units must form a valid codepoint with the second being 0 when a
    /// surrogate pair is not required.
    /// Violating this can easily lead to undefined behavior.
    pub const unsafe fn from_tuple_unchecked(utf16: (u16,Option<u16>)) -> Self {
        let second = match utf16.1 {
            Some(extra) => extra,
            None => 0,
        };
        Utf16Char { units: [utf16.0, second] }
    }
    /// Create an `Utf16Char` from a single unit.
    ///
    /// Codepoints less than `'\u{1_00_00}'` (which fit in an `u16`)
    /// are part of the basic multilingual plane
    /// unless they are reserved for surrogate pairs.
    ///
    /// # Errors
    ///
    /// Returns `NonBmpError` if the unit is in the range `0xd800..0xe000`
    /// (which means that it's part of a surrogate pair)
    ///
    /// # Examples
    ///
    /// ```
    /// # use encode_unicode::Utf16Char;
    /// assert_eq!(Utf16Char::from_bmp(0x40).unwrap(), '@');
    /// assert_eq!(Utf16Char::from_bmp('ø' as u16).unwrap(), 'ø');
    /// assert!(Utf16Char::from_bmp(0xdddd).is_err());
    /// ```
    pub const fn from_bmp(bmp_codepoint: u16) -> Result<Self,NonBmpError> {
        let is_not_bmp = bmp_codepoint & 0xf800 == 0xd800;
        let if_good = Utf16Char{ units: [bmp_codepoint, 0] };
        // Branchless select: index 0 on success, 1 on failure.
        [Ok(if_good), Err(NonBmpError)][is_not_bmp as usize]
    }
    /// Create an `Utf16Char` from a single unit without checking that it's a
    /// valid codepoint on its own.
    ///
    /// # Safety
    ///
    /// The unit must be less than 0xd800 or greater than 0xdfff.
    /// In other words, not part of a surrogate pair.
    /// Violating this can easily lead to undefined behavior.
    #[inline]
    pub const unsafe fn from_bmp_unchecked(bmp_codepoint: u16) -> Self {
        Utf16Char{ units: [bmp_codepoint, 0] }
    }
    /// Checks that the codepoint is in the basic multilingual plane.
    ///
    /// # Examples
    /// ```
    /// # use encode_unicode::Utf16Char;
    /// assert_eq!(Utf16Char::from('e').is_bmp(), true);
    /// assert_eq!(Utf16Char::from('€').is_bmp(), true);
    /// assert_eq!(Utf16Char::from('𝔼').is_bmp(), false);
    /// ```
    #[inline]
    pub const fn is_bmp(self) -> bool {
        self.units[1] == 0
    }
    /// The number of units this character is made up of.
    ///
    /// Is either 1 or 2 and identical to `.as_char().len_utf16()`
    /// or `.as_ref().len()`.
    #[inline]
    pub const fn len(self) -> usize {
        // The second unit is either 0 or a trailing surrogate (>= 0xdc00),
        // whose most significant bit is set, so the shift yields 0 or 1.
        1 + (self.units[1] as usize >> 15)
    }
    // There is no `.is_empty()` because it would always return false.
    /// Checks that the codepoint is an ASCII character.
    #[inline]
    pub const fn is_ascii(self) -> bool {
        self.units[0] <= 127
    }
    /// Checks that two characters are an ASCII case-insensitive match.
    ///
    /// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`.
    pub const fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
        if self.is_ascii() && other.is_ascii() {
            (self.units[0] as u8).eq_ignore_ascii_case(&(other.units[0] as u8))
        } else {
            self.units[0] == other.units[0] && self.units[1] == other.units[1]
        }
    }
    /// Converts the character to its ASCII upper case equivalent.
    ///
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    pub const fn to_ascii_uppercase(self) -> Self {
        // wrapping_sub keeps the comparison valid even for units below 'a'.
        let n = self.units[0].wrapping_sub(b'a' as u16);
        if n < 26 {Utf16Char{ units: [n+b'A' as u16, 0] }}
        else      {self}
    }
    /// Converts the character to its ASCII lower case equivalent.
    ///
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    pub const fn to_ascii_lowercase(self) -> Self {
        let n = self.units[0].wrapping_sub(b'A' as u16);
        if n < 26 {Utf16Char{ units: [n+b'a' as u16, 0] }}
        else      {self}
    }
    /// Converts the character to its ASCII upper case equivalent in-place.
    ///
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    pub fn make_ascii_uppercase(&mut self) {
        *self = self.to_ascii_uppercase()
    }
    /// Converts the character to its ASCII lower case equivalent in-place.
    ///
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    pub fn make_ascii_lowercase(&mut self) {
        *self = self.to_ascii_lowercase();
    }
    /// Convert from UTF-16 to UTF-32
    pub fn to_char(self) -> char {
        self.into()
    }
    /// Write the internal representation to a slice,
    /// and then returns the number of `u16`s written.
    ///
    /// # Panics
    /// Will panic if the buffer is too small;
    /// You can get the required length from `.len()`,
    /// but a buffer of length two is always large enough.
    pub fn to_slice(self, dst: &mut[u16]) -> usize {
        // Write the last unit first to avoid repeated length checks.
        let extra = self.units[1] as usize >> 15;
        match dst.get_mut(extra) {
            Some(first) => *first = self.units[extra],
            None => panic!("The provided buffer is too small.")
        }
        if extra != 0 {dst[0] = self.units[0];}
        extra+1
    }
    /// Get the character represented as an array of two units.
    ///
    /// The second `u16` is zero for codepoints that fit in one unit.
    #[inline]
    pub const fn to_array(self) -> [u16;2] {
        self.units
    }
    /// The second `u16` is used for surrogate pairs.
    #[inline]
    pub const fn to_tuple(self) -> (u16,Option<u16>) {
        // Branchless select: a non-zero second unit is a trailing surrogate
        // with the top bit set, so the shift indexes `Some(...)`.
        (self.units[0], [None, Some(self.units[1])][self.units[1] as usize >> 15])
    }
}

View File

@@ -0,0 +1,265 @@
/* Copyright 2018-2019 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::traits::CharExt;
use crate::utf16_char::Utf16Char;
use crate::errors::EmptyStrError;
extern crate core;
use core::fmt;
use core::borrow::Borrow;
// Invalid values that say the field is consumed or empty.
const FIRST_USED: u16 = 0x_dc_00;
const SECOND_USED: u16 = 0;
/// Iterate over the units of the UTF-16 representation of a codepoint.
#[derive(Clone)]
pub struct Utf16Iterator {
    // Replaced with FIRST_USED (a value never valid as a leading unit)
    // once yielded.
    first: u16,
    // Replaced with SECOND_USED (0) once yielded; already 0 when the
    // codepoint needs only one unit.
    second: u16,
}
impl From<char> for Utf16Iterator {
    fn from(c: char) -> Self {
        // Encode to UTF-16 first, then reuse the Utf16Char conversion.
        Self::from(c.to_utf16())
    }
}
impl From<Utf16Char> for Utf16Iterator {
    fn from(uc: Utf16Char) -> Self {
        let (first, second) = uc.to_tuple();
        // Single-unit codepoints get the SECOND_USED sentinel immediately.
        let second = second.unwrap_or(SECOND_USED);
        Utf16Iterator{first, second}
    }
}
impl Iterator for Utf16Iterator {
    type Item=u16;
    fn next(&mut self) -> Option<u16> {
        // Each field is overwritten with its sentinel after being yielded,
        // so the match also encodes how many units remain.
        match (self.first, self.second) {
            (FIRST_USED, SECOND_USED) => {                                None        },
            (FIRST_USED,   second   ) => {self.second = SECOND_USED;  Some(second)},
            (  first   ,      _     ) => {self.first  = FIRST_USED;   Some(first )},
        }
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        // len() is exact, so both bounds are tight.
        (self.len(), Some(self.len()))
    }
}
impl ExactSizeIterator for Utf16Iterator {
    fn len(&self) -> usize {
        // Count the fields not yet replaced by their sentinel.
        (if self.first == FIRST_USED {0} else {1}) +
        (if self.second == SECOND_USED {0} else {1})
    }
}
impl fmt::Debug for Utf16Iterator {
    /// Renders the remaining units as a bracketed list, e.g. `[104]`
    /// or `[55349, 56657]`, without consuming the iterator.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let mut remaining = self.clone();
        let head = remaining.next();
        let tail = remaining.next();
        if let (Some(a), Some(b)) = (head, tail) {
            write!(fmtr, "[{}, {}]", a, b)
        } else if let Some(one) = head {
            write!(fmtr, "[{}]", one)
        } else {
            write!(fmtr, "[]")
        }
    }
}
/// Converts an iterator of `Utf16Char` (or `&Utf16Char`)
/// to an iterator of `u16`s.
///
/// Is equivalent to calling `.flatten()` or `.flat_map()` on the original iterator,
/// but the returned iterator is about twice as fast.
///
/// The exact number of units cannot be known in advance, but `size_hint()`
/// gives the possible range.
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{IterExt, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf16() );
/// let mut units = [0; 4];
/// iterator.to_units().zip(&mut units).for_each(|(u,dst)| *dst = u );
/// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]);
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{IterExt, CharExt, Utf16Char};
///
/// // (💣 takes two units)
/// let chars: Vec<Utf16Char> = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect();
/// let units: Vec<u16> = chars.iter().to_units().collect();
/// let flat_map: Vec<u16> = chars.iter().cloned().flatten().collect();
/// assert_eq!(units, flat_map);
/// ```
#[derive(Clone)]
pub struct Utf16CharSplitter<U:Borrow<Utf16Char>, I:Iterator<Item=U>> {
    inner: I,
    // The pending trailing surrogate of the previously yielded codepoint,
    // or 0 (never a valid trailing surrogate) when nothing is pending.
    prev_second: u16,
}
impl<U:Borrow<Utf16Char>, I:IntoIterator<Item=U>>
From<I> for Utf16CharSplitter<U, I::IntoIter> {
    fn from(iterable: I) -> Self {
        // Start with no pending trailing surrogate.
        Utf16CharSplitter { inner: iterable.into_iter(),  prev_second: 0 }
    }
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Utf16CharSplitter<U,I> {
    /// Extracts the source iterator.
    ///
    /// Note that `iter.into_inner().to_units()` is not a no-op:
    /// If the last returned unit from `next()` was a leading surrogate,
    /// the trailing surrogate is lost.
    pub fn into_inner(self) -> I {
        self.inner
    }
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Iterator for Utf16CharSplitter<U,I> {
    type Item = u16;
    fn next(&mut self) -> Option<Self::Item> {
        // prev_second == 0 means no trailing surrogate is pending:
        // fetch the next codepoint and stash its (possibly zero) second unit.
        if self.prev_second == 0 {
            self.inner.next().map(|u16c| {
                let units = u16c.borrow().to_array();
                self.prev_second = units[1];
                units[0]
            })
        } else {
            // Yield the stashed trailing surrogate and clear it.
            let prev_second = self.prev_second;
            self.prev_second = 0;
            Some(prev_second)
        }
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        // Doesn't need to handle unlikely overflows correctly because
        // size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
        // Each codepoint yields one or two units, plus any pending unit.
        let (min, max) = self.inner.size_hint();
        let add = if self.prev_second == 0 {0} else {1};
        (min.wrapping_add(add), max.map(|max| max.wrapping_mul(2).wrapping_add(add) ))
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
#[derive(Clone)]
pub struct Utf16CharIndices<'a>{
    str: &'a str,
    // Byte offset of the next codepoint to yield from the front.
    index: usize,
}
impl<'a> From<&'a str> for Utf16CharIndices<'a> {
    fn from(s: &str) -> Utf16CharIndices {
        // Start iterating at the first byte.
        Utf16CharIndices{str: s, index: 0}
    }
}
impl<'a> Utf16CharIndices<'a> {
    /// Extract the remainder of the source `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::{StrExt, Utf16Char};
    /// let mut iter = "abc".utf16char_indices();
    /// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c'))));
    /// assert_eq!(iter.next(), Some((0, Utf16Char::from('a'))));
    /// assert_eq!(iter.as_str(), "b");
    /// ```
    pub fn as_str(&self) -> &'a str {
        // `index` always sits on a codepoint boundary, so slicing is safe.
        &self.str[self.index..]
    }
}
impl<'a> Iterator for Utf16CharIndices<'a> {
    type Item = (usize,Utf16Char);
    fn next(&mut self) -> Option<(usize,Utf16Char)> {
        // from_str_start() only fails on an empty remainder,
        // which is exactly the end-of-iteration condition.
        match Utf16Char::from_str_start(&self.str[self.index..]) {
            Ok((u16c, bytes)) => {
                let item = (self.index, u16c);
                self.index += bytes;
                Some(item)
            },
            Err(EmptyStrError) => None
        }
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        // A UTF-8 sequence is 1..=4 bytes, so the remaining byte count
        // bounds the number of codepoints from both sides.
        let len = self.str.len() - self.index;
        // For len+3 to overflow, the slice must fill all but two bytes of
        // addressable memory, and size_hint() doesn't need to be correct.
        (len.wrapping_add(3)/4, Some(len))
    }
}
impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> {
    fn next_back(&mut self) -> Option<(usize,Utf16Char)> {
        if self.index < self.str.len() {
            // Scan backwards over UTF-8 continuation bytes (0b10xx_xxxx)
            // to find the length of the last codepoint's sequence.
            let rev = self.str.bytes().rev();
            let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
            let starts = self.str.len() - len;
            let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap();
            // Shrink the slice from the back; the front index is unaffected.
            self.str = &self.str[..starts];
            Some((starts, u16c))
        } else {
            None
        }
    }
}
impl<'a> fmt::Debug for Utf16CharIndices<'a> {
    /// Shows the current byte offset and the unvisited remainder of the str.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let mut tuple = fmtr.debug_tuple("Utf16CharIndices");
        tuple.field(&self.index);
        tuple.field(&self.as_str());
        tuple.finish()
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
#[derive(Clone)]
// Newtype over Utf16CharIndices that drops the byte offsets.
pub struct Utf16Chars<'a>(Utf16CharIndices<'a>);
impl<'a> From<&'a str> for Utf16Chars<'a> {
    fn from(s: &str) -> Utf16Chars {
        // Delegate to the index-tracking iterator.
        Utf16Chars(Utf16CharIndices::from(s))
    }
}
impl<'a> Utf16Chars<'a> {
    /// Extract the remainder of the source `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::{StrExt, Utf16Char};
    /// let mut iter = "abc".utf16chars();
    /// assert_eq!(iter.next(), Some(Utf16Char::from('a')));
    /// assert_eq!(iter.next_back(), Some(Utf16Char::from('c')));
    /// assert_eq!(iter.as_str(), "b");
    /// ```
    pub fn as_str(&self) -> &'a str {
        self.0.as_str()
    }
}
impl<'a> Iterator for Utf16Chars<'a> {
    type Item = Utf16Char;
    /// Delegates to the index-tracking iterator and discards the offset.
    fn next(&mut self) -> Option<Utf16Char> {
        match self.0.next() {
            Some((_, u16c)) => Some(u16c),
            None => None,
        }
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        self.0.size_hint()
    }
}
impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
    /// Delegates to the index-tracking iterator and discards the offset.
    fn next_back(&mut self) -> Option<Utf16Char> {
        match self.0.next_back() {
            Some((_, u16c)) => Some(u16c),
            None => None,
        }
    }
}
impl<'a> fmt::Debug for Utf16Chars<'a> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Only the unvisited remainder is shown; byte offsets are internal.
        fmtr.debug_tuple("Utf16Chars")
            .field(&self.as_str())
            .finish()
    }
}

647
vendor/encode_unicode/src/utf8_char.rs vendored Normal file
View File

@@ -0,0 +1,647 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::errors::{FromStrError, EmptyStrError, NonAsciiError, Utf8Error};
use crate::utf8_iterators::Utf8Iterator;
use crate::traits::{CharExt, U8UtfExt};
use crate::utf16_char::Utf16Char;
extern crate core;
use core::{hash, fmt, str, ptr};
use core::cmp::Ordering;
use core::borrow::Borrow;
use core::ops::Deref;
#[cfg(feature="std")]
use core::iter::FromIterator;
#[cfg(feature="ascii")]
extern crate ascii;
#[cfg(feature="ascii")]
use ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError};
// `char` provides a `Default` impl, so mirror it even though no default
// value is obviously useful for a character type.
#[derive(Default)]
// char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either.
// The default impl of Ord for arrays works out because longer codepoints
// start with more ones, so if they're equal, the length is the same,
// breaks down for values above 0x1f_ff_ff but those can only be created by unsafe code.
#[derive(PartialEq,Eq, PartialOrd,Ord)]
#[derive(Clone,Copy)]
/// An unicode codepoint stored as UTF-8.
///
/// It can be borrowed as a `str`, and has the same size as `char`.
pub struct Utf8Char {
    // The UTF-8 sequence, left-aligned; unused trailing bytes are zero.
    bytes: [u8; 4],
}
/////////////////////
//conversion traits//
/////////////////////
impl str::FromStr for Utf8Char {
    type Err = FromStrError;
    /// Create an `Utf8Char` from a string slice.
    /// The string must contain exactly one codepoint.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::error::FromStrError::*;
    /// use encode_unicode::Utf8Char;
    /// use std::str::FromStr;
    ///
    /// assert_eq!(Utf8Char::from_str("a"), Ok(Utf8Char::from('a')));
    /// assert_eq!(Utf8Char::from_str("🂠"), Ok(Utf8Char::from('🂠')));
    /// assert_eq!(Utf8Char::from_str(""), Err(Empty));
    /// assert_eq!(Utf8Char::from_str("ab"), Err(MultipleCodepoints));
    /// assert_eq!(Utf8Char::from_str("é"), Err(MultipleCodepoints));// 'e'+u301 combining mark
    /// ```
    fn from_str(s: &str) -> Result<Self, FromStrError> {
        if s.is_empty() {
            Err(FromStrError::Empty)
        } else if s.len() != 1+s.as_bytes()[0].extra_utf8_bytes_unchecked() {
            // The leading byte encodes the sequence length; any extra bytes
            // mean the str contains more than one codepoint.
            Err(FromStrError::MultipleCodepoints)
        } else {
            let mut bytes = [0; 4];
            bytes[..s.len()].copy_from_slice(s.as_bytes());
            Ok(Utf8Char{bytes})
        }
    }
}
impl From<Utf16Char> for Utf8Char {
    // Re-encodes UTF-16 as UTF-8 by dispatching on the number of UTF-8
    // bytes the codepoint needs (1-3 for BMP, 4 for surrogate pairs).
    fn from(utf16: Utf16Char) -> Utf8Char {
        match utf16.to_tuple() {
            (ascii @ 0..=0x00_7f, _) => {
                // 1 byte: the unit is the byte.
                Utf8Char{ bytes: [ascii as u8, 0, 0, 0] }
            },
            (unit @ 0..=0x07_ff, _) => {
                // 2 bytes: 5 + 6 bits.
                let byte2 = 0x80 |  (unit & 0x00_3f)       as u8;
                let byte1 = 0xc0 | ((unit & 0x07_c0) >> 6) as u8;
                Utf8Char{ bytes: [byte1, byte2, 0, 0] }
            },
            (unit, None) => {
                // 3 bytes: 4 + 6 + 6 bits.
                let byte3 = 0x80 |  (unit & 0x00_3f)        as u8;
                let byte2 = 0x80 | ((unit & 0x0f_c0) >>  6) as u8;
                let byte1 = 0xe0 | ((unit & 0xf0_00) >> 12) as u8;
                Utf8Char{ bytes: [byte1, byte2, byte3, 0] }
            },
            (first, Some(second)) => {
                // 4 bytes: undo the surrogate bias, then pack
                // 3 + 6 + 6 + 6 bits across the two units.
                let first = first + (0x01_00_00u32 >> 10) as u16;
                let byte4 = 0x80 |  (second & 0x00_3f)       as u8;
                let byte3 = 0x80 | ((second & 0x03_c0) >> 6) as u8
                                 | (( first & 0x00_03) << 4) as u8;
                let byte2 = 0x80 | (( first & 0x00_fc) >> 2) as u8;
                let byte1 = 0xf0 | (( first & 0x07_00) >> 8) as u8;
                Utf8Char{ bytes: [byte1, byte2, byte3, byte4] }
            }
        }
    }
}
impl From<char> for Utf8Char {
    fn from(c: char) -> Self {
        // Delegate to the const constructor.
        Utf8Char::new(c)
    }
}
impl From<Utf8Char> for char {
    fn from(uc: Utf8Char) -> char {
        // Decode back to UTF-32.
        uc.to_char()
    }
}
impl IntoIterator for Utf8Char {
    type Item=u8;
    type IntoIter=Utf8Iterator;
    /// Iterate over the byte values.
    fn into_iter(self) -> Utf8Iterator {
        Utf8Iterator::from(self)
    }
}
#[cfg(feature="std")]
impl Extend<Utf8Char> for Vec<u8> {
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) {
        let iter = iter.into_iter();
        // Reserve at least one byte per codepoint up front.
        self.reserve(iter.size_hint().0);
        for u8c in iter {
            // twice as fast as self.extend_from_slice(u8c.as_bytes());
            // The first byte is always part of the sequence; trailing
            // zero bytes are padding, not data.
            self.push(u8c.bytes[0]);
            for &extra in &u8c.bytes[1..] {
                if extra != 0 {
                    self.push(extra);
                }
            }
        }
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf8Char> for Vec<u8> {
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) {
        // Utf8Char is Copy; clone and reuse the by-value impl.
        self.extend(iter.into_iter().cloned())
    }
}
#[cfg(feature="std")]
impl Extend<Utf8Char> for String {
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) {
        // SAFETY: only complete UTF-8 sequences are appended,
        // so the String remains valid UTF-8.
        unsafe { self.as_mut_vec().extend(iter) }
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf8Char> for String {
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) {
        // Utf8Char is Copy; clone and reuse the by-value impl.
        self.extend(iter.into_iter().cloned())
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf8Char> for String {
    /// Collects the UTF-8 bytes of each codepoint into a `String`.
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> String {
        // Delegate to Extend, which reserves from the size hint
        // and appends each codepoint's bytes directly.
        let mut string = String::new();
        string.extend(iter);
        string // idiomatic tail expression instead of `return string;`
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf8Char> for String {
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> String {
        // Utf8Char is Copy; clone and reuse the by-value impl.
        iter.into_iter().cloned().collect()
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf8Char> for Vec<u8> {
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> Self {
        // Build a String (which has the optimized Extend impl),
        // then unwrap its byte buffer without copying.
        iter.into_iter().collect::<String>().into_bytes()
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf8Char> for Vec<u8> {
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> Self {
        // Utf8Char is Copy; clone and reuse the by-value path.
        iter.into_iter().cloned().collect::<String>().into_bytes()
    }
}
/////////////////
//getter traits//
/////////////////
impl AsRef<[u8]> for Utf8Char {
    fn as_ref(&self) -> &[u8] {
        // Expose only the bytes that are part of the sequence,
        // not the zero padding.
        &self.bytes[..self.len()]
    }
}
impl AsRef<str> for Utf8Char {
    fn as_ref(&self) -> &str {
        // SAFETY: the stored bytes are always a single valid UTF-8 sequence.
        unsafe{ str::from_utf8_unchecked( self.as_ref() ) }
    }
}
impl Borrow<[u8]> for Utf8Char {
    fn borrow(&self) -> &[u8] {
        // Same view as AsRef<[u8]>.
        self.as_ref()
    }
}
impl Borrow<str> for Utf8Char {
    fn borrow(&self) -> &str {
        // Same view as AsRef<str>.
        self.as_ref()
    }
}
impl Deref for Utf8Char {
    type Target = str;
    fn deref(&self) -> &Self::Target {
        // Lets `str` methods be called directly on an Utf8Char.
        self.as_ref()
    }
}
////////////////
//ascii traits//
////////////////
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl From<AsciiChar> for Utf8Char {
    fn from(ac: AsciiChar) -> Self {
        // ASCII is a single UTF-8 byte; the rest is zero padding.
        Utf8Char{ bytes: [ac.as_byte(),0,0,0] }
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl ToAsciiChar for Utf8Char {
    fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> {
        // ASCII is always a single byte, so checking the first suffices.
        self.bytes[0].to_ascii_char()
    }
    unsafe fn to_ascii_char_unchecked(self) -> AsciiChar {
        // SAFETY: the caller guarantees this codepoint is ASCII,
        // so the first byte is <= 127.
        unsafe { self.bytes[0].to_ascii_char_unchecked() }
    }
}
/////////////////////////////////////////////////////////
//General traits that cannot be derived to emulate char//
/////////////////////////////////////////////////////////
impl hash::Hash for Utf8Char {
    /// Hashes the codepoint the same way `char` does, so equal characters
    /// hash identically regardless of representation.
    fn hash<H : hash::Hasher>(&self, state: &mut H) {
        self.to_char().hash(state);
    }
}
impl fmt::Debug for Utf8Char {
    /// Formats the codepoint exactly like `char`'s `Debug` implementation.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        write!(fmtr, "{:?}", self.to_char())
    }
}
impl fmt::Display for Utf8Char {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Already UTF-8, so the str view can be written directly.
        fmtr.write_str(self.as_str())
    }
}
////////////////////////////////
//Comparisons with other types//
////////////////////////////////
impl PartialEq<char> for Utf8Char {
    fn eq(&self, u32c: &char) -> bool {
        // Convert and compare byte representations.
        *self == Utf8Char::from(*u32c)
    }
}
impl PartialEq<Utf8Char> for char {
    fn eq(&self, u8c: &Utf8Char) -> bool {
        // Mirror of the impl above, so `==` works in both directions.
        Utf8Char::from(*self) == *u8c
    }
}
impl PartialOrd<char> for Utf8Char {
    fn partial_cmp(&self, u32c: &char) -> Option<Ordering> {
        // Convert and reuse the Utf8Char ordering.
        self.partial_cmp(&Self::from(*u32c))
    }
}
impl PartialOrd<Utf8Char> for char {
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        // Mirror of the impl above, so `<`/`>` work in both directions.
        Utf8Char::from(*self).partial_cmp(u8c)
    }
}
impl PartialEq<Utf16Char> for Utf8Char {
    fn eq(&self, u16c: &Utf16Char) -> bool {
        // Compare in the UTF-8 representation.
        *self == Self::from(*u16c)
    }
}
impl PartialOrd<Utf16Char> for Utf8Char {
    fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
        // Compare in the UTF-8 representation.
        self.partial_cmp(&Self::from(*u16c))
    }
}
// The other direction is implemented in utf16_char.rs
/// Only considers the byte equal if both it and the `Utf8Char` represents ASCII characters.
///
/// There is no impl in the opposite direction, as this should only be used to
/// compare `Utf8Char`s against constants.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf8Char;
/// assert!(Utf8Char::from('8') == b'8');
/// assert!(Utf8Char::from_array([0xf1,0x80,0x80,0x80]).unwrap() != 0xf1);
/// assert!(Utf8Char::from('\u{ff}') != 0xff);
/// assert!(Utf8Char::from('\u{80}') != 0x80);
/// ```
impl PartialEq<u8> for Utf8Char {
    fn eq(&self, byte: &u8) -> bool {
        // bytes[1] == 0 means the sequence is a single byte, i.e. ASCII.
        self.bytes[0] == *byte && self.bytes[1] == 0
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII never compare equal.
impl PartialEq<AsciiChar> for Utf8Char {
    #[inline]
    fn eq(&self, ascii: &AsciiChar) -> bool {
        // Multi-byte sequences start with a byte >= 0x80, which can never
        // equal an ASCII value, so comparing the first byte suffices.
        self.bytes[0] == *ascii as u8
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII never compare equal.
impl PartialEq<Utf8Char> for AsciiChar {
    #[inline]
    fn eq(&self, u8c: &Utf8Char) -> bool {
        // Delegate to the impl above with the operands swapped.
        u8c == self
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII always compare greater.
impl PartialOrd<AsciiChar> for Utf8Char {
    #[inline]
    fn partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering> {
        // Leading bytes of multi-byte sequences are >= 0x80 > any ASCII
        // value, so comparing only the first byte orders them last.
        self.bytes[0].partial_cmp(ascii)
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII always compare greater.
impl PartialOrd<Utf8Char> for AsciiChar {
    #[inline]
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        // Mirror of the impl above, so `<`/`>` work in both directions.
        self.partial_cmp(&u8c.bytes[0])
    }
}
///////////////////////////////////////////////////////
//pub impls that should be together for nicer rustdoc//
///////////////////////////////////////////////////////
impl Utf8Char {
/// A `const fn` alternative to the trait-based `Utf8Char::from(char)`.
///
/// # Example
///
/// ```
/// # use encode_unicode::Utf8Char;
/// const REPLACEMENT_CHARACTER: Utf8Char = Utf8Char::new('\u{fffd}');
/// ```
    pub const fn new(c: char) -> Self {
        if c.is_ascii() {
            Utf8Char{bytes: [c as u8, 0, 0, 0]}
        } else {
            // How many extra UTF-8 bytes that are needed to represent an
            // UTF-32 codepoint with a number of bits.
            // Stored as a bit-packed array using two bits per value.
            //      0..=7  bits = no extra bytes
            // +4 = 8..=11 bits = one extra byte    (5+6 bits)
            // +5 = 12..=16 bits = two extra bytes  (4+6+6 bits)
            // +5 = 17..=21 bits = three extra bytes (3+6+6+6 bits)
            const EXTRA_BYTES: u64 = 0b11_11_11_11_11__10_10_10_10_10__01_01_01_01__00_00_00_00_00_00_00__00;
            let bits_used = 32 - (c as u32).leading_zeros();
            let len = 1 + ((EXTRA_BYTES >> (bits_used*2)) & 0b11);
            // copied from CharExt::to_utf8_array()
            let mut c = c as u32;
            let mut parts = 0;// convert to 6-bit bytes
                        parts |= c & 0x3f;  c>>=6;
            parts<<=8;  parts |= c & 0x3f;  c>>=6;
            parts<<=8;  parts |= c & 0x3f;  c>>=6;
            parts<<=8;  parts |= c & 0x3f;
            parts |= 0x80_80_80_80;// set the most significant bit
            parts >>= 8*(4-len);// right-align bytes
            // Now, unused bytes are zero, (which matters for Utf8Char.eq())
            // and the rest are 0b10xx_xxxx
            // set header on first byte
            parts |= (0xff_00u32 >> len)  &  0xff;// store length
            parts &= !(1u32 << (7-len));// clear the next bit after it
            Utf8Char {bytes: parts.to_le_bytes()}
        }
    }
/// Create an `Utf8Char` from the first codepoint in a `str`.
///
/// Returns an error if the `str` is empty.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
///
/// assert_eq!(Utf8Char::from_str_start("a"), Ok((Utf8Char::from('a'),1)));
/// assert_eq!(Utf8Char::from_str_start("ab"), Ok((Utf8Char::from('a'),1)));
/// assert_eq!(Utf8Char::from_str_start("🂠 "), Ok((Utf8Char::from('🂠'),4)));
/// assert_eq!(Utf8Char::from_str_start("é"), Ok((Utf8Char::from('e'),1)));// 'e'+u301 combining mark
/// assert!(Utf8Char::from_str_start("").is_err());
/// ```
pub fn from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError> {
unsafe {
if src.is_empty() {
Err(EmptyStrError)
} else {
Ok(Utf8Char::from_slice_start_unchecked(src.as_bytes()))
}
}
}
/// Create an `Utf8Char` of the first codepoint in an UTF-8 slice.
/// Also returns the length of the UTF-8 sequence for the codepoint.
///
/// If the slice is from a `str`, use `::from_str_start()` to skip UTF-8 validation.
///
/// # Errors
///
/// Returns an `Err` if the slice is empty, doesn't start with a valid
/// UTF-8 sequence or is too short for the sequence.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
/// use encode_unicode::error::Utf8ErrorKind::*;
///
/// assert_eq!(Utf8Char::from_slice_start(&[b'A', b'B', b'C']), Ok((Utf8Char::from('A'),1)));
/// assert_eq!(Utf8Char::from_slice_start(&[0xdd, 0xbb]), Ok((Utf8Char::from('\u{77b}'),2)));
///
/// assert_eq!(Utf8Char::from_slice_start(&[]).unwrap_err().kind(), TooFewBytes);
/// assert_eq!(Utf8Char::from_slice_start(&[0xf0, 0x99]).unwrap_err().kind(), TooFewBytes);
/// assert_eq!(Utf8Char::from_slice_start(&[0xee, b'F', 0x80]).unwrap_err().kind(), InterruptedSequence);
/// assert_eq!(Utf8Char::from_slice_start(&[0xee, 0x99, 0x0f]).unwrap_err().kind(), InterruptedSequence);
/// ```
pub fn from_slice_start(src: &[u8]) -> Result<(Self,usize),Utf8Error> {
char::from_utf8_slice_start(src).map(|(_,len)| {
let mut bytes = [0; 4];
bytes[..len].copy_from_slice(&src[..len]);
(Utf8Char{bytes}, len)
})
}
/// A `from_slice_start()` that doesn't validate the codepoint.
///
/// # Safety
///
/// The slice must be non-empty and start with a valid UTF-8 codepoint.
/// Invalid or incomplete values might cause reads of uninitalized memory.
pub unsafe fn from_slice_start_unchecked(src: &[u8]) -> (Self,usize) {
unsafe {
let len = 1+src.get_unchecked(0).extra_utf8_bytes_unchecked();
let mut bytes = [0; 4];
ptr::copy_nonoverlapping(src.as_ptr(), bytes.as_mut_ptr() as *mut u8, len);
(Utf8Char{bytes}, len)
}
}
/// Create an `Utf8Char` from a byte array after validating it.
///
/// The codepoint must start at the first byte.
/// Unused bytes are set to zero by this function and so can be anything.
///
/// # Errors
///
/// Returns an `Err` if the array doesn't start with a valid UTF-8 sequence.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
/// use encode_unicode::error::Utf8ErrorKind::*;
///
/// assert_eq!(Utf8Char::from_array([b'A', 0, 0, 0]), Ok(Utf8Char::from('A')));
/// assert_eq!(Utf8Char::from_array([0xf4, 0x8b, 0xbb, 0xbb]), Ok(Utf8Char::from('\u{10befb}')));
/// assert_eq!(Utf8Char::from_array([b'A', b'B', b'C', b'D']), Ok(Utf8Char::from('A')));
/// assert_eq!(Utf8Char::from_array([0, 0, 0xcc, 0xbb]), Ok(Utf8Char::from('\0')));
///
/// assert_eq!(Utf8Char::from_array([0xef, b'F', 0x80, 0x80]).unwrap_err().kind(), InterruptedSequence);
/// assert_eq!(Utf8Char::from_array([0xc1, 0x80, 0, 0]).unwrap_err().kind(), NonUtf8Byte);
/// assert_eq!(Utf8Char::from_array([0xe0, 0x9a, 0xbf, 0]).unwrap_err().kind(), OverlongEncoding);
/// assert_eq!(Utf8Char::from_array([0xf4, 0xaa, 0x99, 0x88]).unwrap_err().kind(), TooHighCodepoint);
/// ```
pub fn from_array(utf8: [u8;4]) -> Result<Self,Utf8Error> {
// perform all validation
char::from_utf8_array(utf8)?;
let extra = utf8[0].extra_utf8_bytes_unchecked() as u32;
// zero unused bytes in one operation by transmuting the arrary to
// u32, apply an endian-corrected mask and transmute back
let mask = u32::from_le(0xff_ff_ff_ff >> (8*(3-extra)));
let unused_zeroed = mask & u32::from_ne_bytes(utf8); // native endian
Ok(Utf8Char{ bytes: unused_zeroed.to_ne_bytes() })
}
/// Zero-cost constructor.
///
/// # Safety
///
/// Must contain a valid codepoint starting at the first byte, with the
/// unused bytes zeroed.
/// Bad values can easily lead to undefined behavior.
#[inline]
pub const unsafe fn from_array_unchecked(utf8: [u8;4]) -> Self {
Utf8Char{ bytes: utf8 }
}
/// Create an `Utf8Char` from a single byte.
///
/// The byte must be an ASCII character.
///
/// # Errors
///
/// Returns `NonAsciiError` if the byte greater than 127.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf8Char;
/// assert_eq!(Utf8Char::from_ascii(b'a').unwrap(), 'a');
/// assert!(Utf8Char::from_ascii(128).is_err());
/// ```
pub const fn from_ascii(ascii: u8) -> Result<Self,NonAsciiError> {
[Ok(Utf8Char{ bytes: [ascii, 0, 0, 0] }), Err(NonAsciiError)][(ascii >> 7) as usize]
}
/// Create an `Utf8Char` from a single byte without checking that it's a
/// valid codepoint on its own, which is only true for ASCII characters.
///
/// # Safety
///
/// The byte must be less than 128.
#[inline]
pub const unsafe fn from_ascii_unchecked(ascii: u8) -> Self {
Utf8Char{ bytes: [ascii, 0, 0, 0] }
}
/// The number of bytes this character needs.
///
/// Is between 1 and 4 (inclusive) and identical to `.as_ref().len()` or
/// `.as_char().len_utf8()`.
#[inline]
pub const fn len(self) -> usize {
// Invariants of the extra bytes enambles algorithms that
// `u8.extra_utf8_bytes_unchecked()` cannot use.
// Some of them turned out to require fewer x86 instructions:
// Exploits that unused bytes are zero and calculates the number of
// trailing zero bytes.
// Setting a bit in the first byte prevents the function from returning
// 0 for '\0' (which has 32 leading zeros).
// trailing and leading is swapped below to optimize for little-endian
// architectures.
(4 - (u32::from_le_bytes(self.bytes)|1).leading_zeros()/8) as usize
// Exploits that the extra bytes have their most significant bit set if
// in use.
// Takes fewer instructions than the one above if popcnt can be used,
// (which it cannot by default,
// set RUSTFLAGS='-C target-cpu=native' to enable)
//let all = u32::from_ne_bytes(self.bytes);
//let msb_mask = u32::from_be(0x00808080);
//let add_one = u32::from_be(0x80000000);
//((all & msb_mask) | add_one).count_ones() as usize
}
// There is no .is_emty() because this type is never empty.
/// Checks that the codepoint is an ASCII character.
pub const fn is_ascii(self) -> bool {
self.bytes[0].is_ascii()
}
/// Checks that two characters are an ASCII case-insensitive match.
///
/// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`.
pub const fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
if self.is_ascii() {
self.bytes[0].eq_ignore_ascii_case(&other.bytes[0])
} else {
// [u8; 4] can't be const compared as of Rust 1.60, but u32 can
u32::from_le_bytes(self.bytes) == u32::from_le_bytes(other.bytes)
}
}
/// Converts the character to its ASCII upper case equivalent.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
pub const fn to_ascii_uppercase(mut self) -> Self {
self.bytes[0] = self.bytes[0].to_ascii_uppercase();
self
}
/// Converts the character to its ASCII lower case equivalent.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
pub const fn to_ascii_lowercase(mut self) -> Self {
self.bytes[0] = self.bytes[0].to_ascii_lowercase();
self
}
/// Converts the character to its ASCII upper case equivalent in-place.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
#[inline]
pub fn make_ascii_uppercase(&mut self) {
self.bytes[0].make_ascii_uppercase()
}
/// Converts the character to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
#[inline]
pub fn make_ascii_lowercase(&mut self) {
self.bytes[0].make_ascii_lowercase();
}
/// Convert from UTF-8 to UTF-32
pub fn to_char(self) -> char {
unsafe { char::from_utf8_exact_slice_unchecked(&self.bytes[..self.len()]) }
}
/// Write the internal representation to a slice,
/// and then returns the number of bytes written.
///
/// # Panics
///
/// Will panic the buffer is too small;
/// You can get the required length from `.len()`,
/// but a buffer of length four is always large enough.
pub fn to_slice(self, dst: &mut[u8]) -> usize {
if self.len() > dst.len() {
panic!("The provided buffer is too small.");
}
dst[..self.len()].copy_from_slice(&self.bytes[..self.len()]);
self.len()
}
/// Expose the internal array and the number of used bytes.
pub const fn to_array(self) -> ([u8;4],usize) {
(self.bytes, self.len())
}
/// Return a `str` view of the array the codepoint is stored as.
///
/// Is an unambiguous version of `.as_ref()`.
pub fn as_str(&self) -> &str {
self.deref()
}
}

View File

@@ -0,0 +1,346 @@
/* Copyright 2018-2020 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::utf8_char::Utf8Char;
use crate::errors::EmptyStrError;
extern crate core;
use core::{u32, u64};
use core::ops::Not;
use core::fmt;
use core::borrow::Borrow;
#[cfg(feature="std")]
use std::io::{Read, Error as ioError};
/// Read or iterate over the bytes of the UTF-8 representation of a codepoint.
// Representation: the codepoint's bytes packed little-endian into the u32,
// next byte to yield in the least significant position. Positions that have
// been consumed or were never used hold 0xff (set by `From<Utf8Char>` and by
// `next()`), which is never a valid UTF-8 byte and so marks exhaustion.
#[derive(Clone)]
pub struct Utf8Iterator (u32);
impl From<Utf8Char> for Utf8Iterator {
    /// Pack the codepoint's bytes into the iterator's `u32`,
    /// filling every unused high byte with the 0xff end marker.
    fn from(uc: Utf8Char) -> Self {
        let packed = u32::from_le_bytes(uc.to_array().0);
        // Shift in the u64 domain: a four-byte codepoint needs a 32-bit
        // shift, which would be a no-op on a u32.
        let end_markers = (u64::MAX << (8 * uc.len() as u64)) as u32;
        Utf8Iterator(packed | end_markers)
    }
}
impl From<char> for Utf8Iterator {
fn from(c: char) -> Self {
Self::from(Utf8Char::from(c))
}
}
impl Iterator for Utf8Iterator {
    type Item=u8;
    fn next(&mut self) -> Option<u8> {
        // 0xff is the end marker; it can never be a real UTF-8 byte.
        match self.0 as u8 {
            0xff => None,
            byte => {
                // Shift the consumed byte out and an end marker in.
                self.0 = (self.0 >> 8) | 0xff_00_00_00;
                Some(byte)
            },
        }
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        // The exact count is known, so lower and upper bound agree.
        let remaining = self.len();
        (remaining, Some(remaining))
    }
}
impl ExactSizeIterator for Utf8Iterator {
    fn len(&self) -> usize {
        // Exhausted positions hold 0xff; inverting turns them into leading
        // zero bytes, which can simply be counted.
        let exhausted_bytes = (!self.0).leading_zeros() as usize / 8;
        4 - exhausted_bytes
    }
}
#[cfg(feature="std")]
impl Read for Utf8Iterator {
    /// Always returns Ok
    fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> {
        let mut written = 0;
        // Only advance the iterator once there is room to store the byte.
        while written < buf.len() {
            match self.next() {
                Some(byte) => {
                    buf[written] = byte;
                    written += 1;
                },
                None => break,
            }
        }
        Ok(written)
    }
}
impl fmt::Debug for Utf8Iterator {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Collect the remaining bytes into a stack buffer (a clone keeps
        // `self` untouched) and print them as a slice.
        let mut remaining = [0; 4];
        let mut used = 0;
        for byte in self.clone() {
            remaining[used] = byte;
            used += 1;
        }
        write!(fmtr, "{:?}", &remaining[..used])
    }
}
/// Converts an iterator of `Utf8Char` (or `&Utf8Char`)
/// to an iterator of `u8`s.
///
/// Is equivalent to calling `.flatten()` or `.flat_map()` on the original iterator,
/// but the returned iterator is ~40% faster.
///
/// The iterator also implements `Read` (if the `std` feature isn't disabled).
/// Reading will never produce an error, and calls to `.read()` and `.next()`
/// can be mixed.
///
/// The exact number of bytes cannot be known in advance, but `size_hint()`
/// gives the possible range.
/// (min: all remaining characters are ASCII, max: all require four bytes)
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{IterExt, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf8() );
/// let mut bytes = [0; 4];
/// iterator.to_bytes().zip(&mut bytes).for_each(|(b,dst)| *dst = b );
/// assert_eq!(&bytes, b"foo\0");
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{IterExt, CharExt, Utf8Char};
///
/// let chars: Vec<Utf8Char> = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect();
/// let bytes: Vec<u8> = chars.iter().to_bytes().collect();
/// let flat_map: Vec<u8> = chars.iter().cloned().flatten().collect();
/// assert_eq!(bytes, flat_map);
/// ```
///
/// `Read`ing from it:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{IterExt, CharExt};
/// use std::io::Read;
///
/// let s = "Ååh‽";
/// assert_eq!(s.len(), 8);
/// let mut buf = [b'E'; 9];
/// let mut reader = s.chars().map(|c| c.to_utf8() ).to_bytes();
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8);
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
/// assert_eq!(&buf[..8], s.as_bytes());
/// assert_eq!(buf[8], b'E');
/// ```
#[derive(Clone)]
pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> {
    // The source iterator of codepoints.
    inner: I,
    // Not-yet-yielded bytes of the codepoint currently being split, packed
    // little-endian with the next byte to emit in the least significant
    // position. 0 means no codepoint is in progress (see `next()`).
    prev: u32,
}
impl<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>>
From<I> for Utf8CharSplitter<U,I::IntoIter> {
    /// Start splitting with no codepoint in progress.
    fn from(iterable: I) -> Self {
        let inner = iterable.into_iter();
        Utf8CharSplitter { prev: 0, inner }
    }
}
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> {
    /// Extracts the source iterator.
    ///
    /// Note that `iter.into_inner().to_bytes()` is not a no-op:
    /// If the last returned byte from `next()` was not an ASCII character,
    /// the remaining bytes of that codepoint are lost.
    pub fn into_inner(self) -> I {
        // The bytes buffered in `prev` are intentionally dropped here.
        self.inner
    }
}
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> {
    type Item = u8;
    fn next(&mut self) -> Option<Self::Item> {
        if self.prev != 0 {
            // A codepoint is in progress: emit its next buffered byte.
            let byte = self.prev as u8;
            self.prev >>= 8;
            return Some(byte);
        }
        self.inner.next().map(|u8c| {
            let bytes = u8c.borrow().to_array().0;
            // Buffer everything after the lead byte (zero for ASCII,
            // leaving the "nothing in progress" state unchanged).
            self.prev = u32::from_le_bytes(bytes) >> 8;
            bytes[0]
        })
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        // Unlikely overflows are ignored on purpose: size_hint() cannot be
        // relied upon anyway (the trait isn't unsafe).
        let (inner_min, inner_max) = self.inner.size_hint();
        let buffered = 4 - (self.prev.leading_zeros() / 8) as usize;
        let upper = inner_max.map(|max| max.wrapping_mul(4).wrapping_add(buffered) );
        (inner_min.wrapping_add(buffered), upper)
    }
}
#[cfg(feature="std")]
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Read for Utf8CharSplitter<U,I> {
    /// Always returns `Ok`
    fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> {
        let mut i = 0;
        // write remaining bytes of previous codepoint
        while self.prev != 0 && i < buf.len() {
            buf[i] = self.prev as u8;
            self.prev >>= 8;
            i += 1;
        }
        // write whole characters
        while i < buf.len() {
            let bytes = match self.inner.next() {
                Some(u8c) => u8c.borrow().to_array().0,
                None => break
            };
            buf[i] = bytes[0];
            i += 1;
            if bytes[1] != 0 {
                // Multi-byte codepoint: the number of leading one-bits in the
                // lead byte is the length of the whole sequence.
                let len = bytes[0].not().leading_zeros() as usize;
                let mut written = 1;
                while written < len {
                    if i < buf.len() {
                        buf[i] = bytes[written];
                        i += 1;
                        written += 1;
                    } else {
                        // Buffer filled up mid-codepoint: stash the unwritten
                        // tail in `prev` so the next call (or `next()`) resumes
                        // exactly where this one stopped.
                        let bytes_as_u32 = u32::from_le_bytes(bytes);
                        self.prev = bytes_as_u32 >> (8*written);
                        return Ok(i);
                    }
                }
            }
        }
        Ok(i)
    }
}
/// An iterator over the `Utf8Char` of a string slice, and their positions.
///
/// This struct is created by the `utf8char_indices()` method from [`StrExt`](../trait.StrExt.html)
/// trait. See its documentation for more.
#[derive(Clone)]
pub struct Utf8CharIndices<'a>{
    // The not-yet-consumed part at the back, plus the already-consumed
    // front up to `index` (next_back() shrinks this slice instead).
    str: &'a str,
    // Byte offset of the first codepoint not yet yielded by next();
    // always on a char boundary (it only advances by whole sequences).
    index: usize,
}
impl<'a> From<&'a str> for Utf8CharIndices<'a> {
    /// Start iterating from the first byte of `s`.
    fn from(s: &str) -> Utf8CharIndices {
        Utf8CharIndices { index: 0, str: s }
    }
}
impl<'a> Utf8CharIndices<'a> {
    /// Extract the remainder of the source `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::{StrExt, Utf8Char};
    /// let mut iter = "abc".utf8char_indices();
    /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c'))));
    /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a'))));
    /// assert_eq!(iter.as_str(), "b");
    /// ```
    pub fn as_str(&self) -> &'a str {
        // Everything before `index` has already been yielded by next(),
        // and next_back() trims `str` itself, so the tail is the remainder.
        self.str.split_at(self.index).1
    }
}
impl<'a> Iterator for Utf8CharIndices<'a> {
    type Item = (usize,Utf8Char);
    fn next(&mut self) -> Option<(usize,Utf8Char)> {
        // from_str_start() only fails when the remainder is empty.
        match Utf8Char::from_str_start(&self.str[self.index..]) {
            Err(EmptyStrError) => None,
            Ok((u8c, len)) => {
                let start = self.index;
                self.index = start + len;
                Some((start, u8c))
            },
        }
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        let remaining = self.str.len() - self.index;
        // Lower bound: every codepoint takes four bytes; upper: all ASCII.
        // wrapping_add() because overflow would require a slice filling all
        // but two bytes of addressable memory, and size_hint() doesn't need
        // to be correct.
        (remaining.wrapping_add(3)/4, Some(remaining))
    }
}
impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> {
    fn next_back(&mut self) -> Option<(usize,Utf8Char)> {
        if self.index >= self.str.len() {
            return None;
        }
        // The last codepoint starts right before the trailing run of
        // continuation bytes (0b10xx_xxxx).
        let continuations = self.str.bytes().rev()
            .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
            .count();
        let start = self.str.len() - (1 + continuations);
        // Cannot refactor out the unwrap without switching to ::from_slice(),
        // since slicing the str panics if `start` is not on a boundary.
        let (u8c, _) = Utf8Char::from_str_start(&self.str[start..]).unwrap();
        self.str = &self.str[..start];
        Some((start, u8c))
    }
}
impl<'a> fmt::Debug for Utf8CharIndices<'a> {
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Shown as Utf8CharIndices(index, "remainder").
        let mut tuple = fmtr.debug_tuple("Utf8CharIndices");
        tuple.field(&self.index);
        tuple.field(&self.as_str());
        tuple.finish()
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf8Char`.
// Thin wrapper that delegates to Utf8CharIndices and drops the indices.
#[derive(Clone)]
pub struct Utf8Chars<'a>(Utf8CharIndices<'a>);
impl<'a> From<&'a str> for Utf8Chars<'a> {
fn from(s: &str) -> Utf8Chars {
Utf8Chars(Utf8CharIndices::from(s))
}
}
impl<'a> Utf8Chars<'a> {
    /// Extract the remainder of the source `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::{StrExt, Utf8Char};
    /// let mut iter = "abc".utf8chars();
    /// assert_eq!(iter.next(), Some(Utf8Char::from('a')));
    /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c')));
    /// assert_eq!(iter.as_str(), "b");
    /// ```
    pub fn as_str(&self) -> &'a str {
        // Pure delegation to the wrapped Utf8CharIndices.
        self.0.as_str()
    }
}
impl<'a> Iterator for Utf8Chars<'a> {
    type Item = Utf8Char;
    fn next(&mut self) -> Option<Utf8Char> {
        // Drop the byte index from the underlying (index, char) pair.
        let (_index, u8c) = self.0.next()?;
        Some(u8c)
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        self.0.size_hint()
    }
}
impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
    fn next_back(&mut self) -> Option<Utf8Char> {
        // Same delegation as next(), from the back.
        let (_index, u8c) = self.0.next_back()?;
        Some(u8c)
    }
}
impl<'a> fmt::Debug for Utf8Chars<'a> {
    /// Formats as `Utf8Chars("remaining")`.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // Fix: the tuple name used to say "Utf8CharIndices" — a copy-paste
        // slip from the wrapped iterator — which made Debug output
        // misidentify this type.
        fmtr.debug_tuple("Utf8Chars")
            .field(&self.as_str())
            .finish()
    }
}

227
vendor/encode_unicode/tests/errs.rs vendored Normal file
View File

@@ -0,0 +1,227 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
* Copyright 2018 Aljoscha Meyer
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Test that methods gives the correct error.
//! Some also test a bit more because it's easy.
extern crate core;
use core::char;
extern crate encode_unicode;
use encode_unicode::*;
use encode_unicode::error::*;
use encode_unicode::error::CodepointError::*;
use encode_unicode::error::Utf8ErrorKind::*;
#[test] fn from_u32() {
    // The whole surrogate range is reserved for UTF-16.
    for c in 0xd800..0xe000 {
        assert_eq!(char::from_u32_detailed(c), Err(Utf16Reserved));
    }
    // Everything above the last codepoint is too high. Sample the range
    // with a large step instead of testing every value.
    // (Range.step_by() is unstable.)
    let mut candidate = Some(0x11_00_00);
    while let Some(c) = candidate {
        assert_eq!(char::from_u32_detailed(c), Err(TooHigh));
        candidate = c.checked_add(0x10_11_11);
    }
}
// Test helper: collapse a detailed Utf8Error into its Utf8ErrorKind so the
// assertions below can compare against kinds directly.
fn kind<T>(result: Result<T,Utf8Error>) -> Result<T,Utf8ErrorKind> {
    result.map_err(|e| e.kind() )
}
#[test] fn utf8_extra_bytes() {
    // Checked variant: every byte value maps to either its extra-byte count
    // or the specific error for that bit pattern.
    for c in 0..256 {
        assert_eq!( kind((c as u8).extra_utf8_bytes()), match c {
            0b_1000_0000..=0b_1011_1111 => Err(UnexpectedContinuationByte),
            0b_1100_0000..=0b_1100_0001 => Err(NonUtf8Byte),
            0b_1111_0101..=0b_1111_0111 => Err(NonUtf8Byte),
            0b_1111_1000..=0b_1111_1111 => Err(NonUtf8Byte),
            0b_0000_0000..=0b_0111_1111 => Ok(0),
            0b_1100_0010..=0b_1101_1111 => Ok(1),
            0b_1110_0000..=0b_1110_1111 => Ok(2),
            0b_1111_0000..=0b_1111_0100 => Ok(3),
            _ => unreachable!(),
        });
    }
    // Unchecked variant: only the listed patterns have a defined result;
    // the `continue` skips byte values with no expectation.
    for c in 0..256 {
        assert_eq!((c as u8).extra_utf8_bytes_unchecked(), match c {
            0b_0000_0000..=0b_0111_1111 => 0,
            0b_1100_0000..=0b_1101_1111 => 1,
            0b_1110_0000..=0b_1110_1111 => 2,
            0b_1111_0000..=0b_1111_0111 => 3,
            0b_1000_0000..=0b_1011_1111 => 0,
            0b_1111_1111 => 7,
            _ => continue,
        });
    }
}
#[test]
#[cfg_attr(miri, ignore)]
fn utf16_extra_unit() {
    // Every possible UTF-16 unit: leading surrogates need an extra unit,
    // trailing surrogates are an error as a first unit, the rest stand alone.
    for c in 0..0x1_00_00 {
        assert_eq!( (c as u16).utf16_needs_extra_unit(), match c {
            0b_0000_0000_0000_0000..=0b_1101_0111_1111_1111 => Ok(false),
            0b_1101_1000_0000_0000..=0b_1101_1011_1111_1111 => Ok(true),
            0b_1101_1100_0000_0000..=0b_1101_1111_1111_1111 => Err(Utf16FirstUnitError),
            0b_1110_0000_0000_0000..=0b_1111_1111_1111_1111 => Ok(false),
            _ => unreachable!(),
        });
    }
}
#[test]
#[cfg_attr(miri, ignore)]
fn from_utf16_tuple() {
    use encode_unicode::error::Utf16TupleError::*;
    // A trailing surrogate first is rejected no matter what follows.
    for u in 0xdc00..0xe000 {
        let close = if u%3==0 {u-100} else {u+100};
        let doesnt_matter = if u%2==0 {Some(close)} else {None};
        assert_eq!(char::from_utf16_tuple((u,doesnt_matter)), Err(FirstIsTrailingSurrogate));
    }
    // A self-contained first unit must not be given a second unit.
    for u in (0..0xd800).chain(0xe000..0x10000) {
        assert_eq!(
            char::from_utf16_tuple((u as u16, Some((0x100+u) as u16))),
            Err(SuperfluousSecond)
        );
    }
    // A leading surrogate needs a second unit, and it must be a trailing one.
    for u in 0xd800..0xdc00 {
        assert_eq!(char::from_utf16_tuple((u,None)), Err(MissingSecond));
        assert_eq!(char::from_utf16_tuple((u,Some(u - 0x2ff))), Err(SecondIsNotTrailingSurrogate));
    }
}
#[test] fn from_utf16_slice_start() {
    use encode_unicode::error::Utf16SliceError::*;
    assert_eq!(char::from_utf16_slice_start(&[]), Err(EmptySlice));
    let mut buf = [0; 6];
    // Leading surrogates: alone they are MissingSecond, and a leading
    // surrogate cannot be followed by another leading surrogate.
    // `pass` varies the slice length to exercise different tail lengths.
    for u in 0xd800..0xdc00 {
        buf[0] = u;
        assert_eq!(char::from_utf16_slice_start(&buf[..1]), Err(MissingSecond));
        buf[1] = u;
        let pass = 2 + (u as usize % (buf.len()-2));
        assert_eq!(char::from_utf16_slice_start(&buf[..pass]), Err(SecondIsNotTrailingSurrogate));
    }
    // A trailing surrogate can never start a sequence.
    for u in 0xdc00..0xe000 {
        buf[0] = u;
        let close = if u%3==0 {u-100} else {u+100};
        let pass = 1 + (u as usize % (buf.len()-1));
        buf[pass] = close;
        assert_eq!(char::from_utf16_slice_start(&buf[..pass]), Err(FirstIsTrailingSurrogate));
    }
}
#[test] fn utf8_overlong() {
    // Overlong encodings where the lead byte itself is a legal UTF-8 byte:
    // detected by the second byte and reported as OverlongEncoding.
    let overlongs = [
        [0xf0,0x8f], [0xf0,0x87], [0xf0,0x80], // 4-byte
        [0xe0,0x9f], [0xe0,0x8f], [0xe0,0x80], // 3-byte
    ];
    for o in overlongs.iter() {
        for &last in &[0x80, 0xbf] {
            let arr = [o[0], o[1], last, last];
            assert_eq!(kind(char::from_utf8_slice_start(&arr)), Err(OverlongEncoding));
            assert_eq!(kind(char::from_utf8_array(arr)), Err(OverlongEncoding));
            assert_eq!(kind(Utf8Char::from_slice_start(&arr)), Err(OverlongEncoding));
            assert_eq!(kind(Utf8Char::from_array(arr)), Err(OverlongEncoding));
        }
    }
    // 0xc0 and 0xc1 can only begin overlong two-byte sequences, so the lead
    // byte alone is already NonUtf8Byte.
    let non_utf8 = [
        [0xc1,0xbf], [0xc1,0x92], [0xc1,0x80], // 2-byte
        [0xc0,0xbf], [0xc0,0x9f], [0xc0,0x80], // 2-byte
    ];
    for non in non_utf8.iter() {
        for &last in &[0x80, 0xbf] {
            let arr = [non[0], non[1], last, last];
            assert_eq!(kind(char::from_utf8_slice_start(&arr)), Err(NonUtf8Byte));
            assert_eq!(kind(char::from_utf8_array(arr)), Err(NonUtf8Byte));
            assert_eq!(kind(Utf8Char::from_slice_start(&arr)), Err(NonUtf8Byte));
            assert_eq!(kind(Utf8Char::from_array(arr)), Err(NonUtf8Byte));
        }
    }
}
#[test] fn from_str_start() {
    // An empty str is the only error case for from_str_start().
    assert_eq!(Utf8Char::from_str_start(""), Err(EmptyStrError));
    assert_eq!(Utf16Char::from_str_start(""), Err(EmptyStrError));
}
#[test] fn utf8_codepoint_is_too_high() {
    // 0xf4 sequences can encode above U+10FFFF: those are TooHighCodepoint.
    assert_eq!(kind(Utf8Char::from_array([0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint));
    assert_eq!(kind(char::from_utf8_array([0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint));
    assert_eq!(kind(Utf8Char::from_slice_start(&[0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint));
    assert_eq!(kind(char::from_utf8_slice_start(&[0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint));
    assert_eq!(kind(Utf8Char::from_array([0xf4, 0xa4, 0xb0, 0x9f])), Err(TooHighCodepoint));
    assert_eq!(kind(char::from_utf8_array([0xf4, 0xa4, 0xb0, 0x9f])), Err(TooHighCodepoint));
    assert_eq!(kind(Utf8Char::from_slice_start(&[0xf4, 0xa4, 0xb0, 0x9f])), Err(TooHighCodepoint));
    assert_eq!(kind(char::from_utf8_slice_start(&[0xf4, 0xa4, 0xb8, 0x9f])), Err(TooHighCodepoint));
    // 0xf5 can never start a valid sequence, so the lead byte itself is
    // reported, not the resulting codepoint.
    assert_eq!(kind(Utf8Char::from_array([0xf5, 0x88, 0x99, 0xaa])), Err(NonUtf8Byte));
    assert_eq!(kind(char::from_utf8_array([0xf5, 0xaa, 0xbb, 0x88])), Err(NonUtf8Byte));
    assert_eq!(kind(Utf8Char::from_slice_start(&[0xf5, 0x99, 0xaa, 0xbb])), Err(NonUtf8Byte));
    assert_eq!(kind(char::from_utf8_slice_start(&[0xf5, 0xbb, 0x88, 0x99])), Err(NonUtf8Byte));
}
#[test] fn utf8_codepoint_is_utf16_reserved() {
    // 0xed 0xa0..0xbf sequences decode to the surrogate range U+D800..U+DFFF,
    // which is invalid in UTF-8 and reported as Utf16ReservedCodepoint.
    assert_eq!(kind(Utf8Char::from_array([0xed, 0xa0, 0x80, 0xff])), Err(Utf16ReservedCodepoint));
    assert_eq!(kind(char::from_utf8_array([0xed, 0xa0, 0x8f, 0x00])), Err(Utf16ReservedCodepoint));
    assert_eq!(kind(Utf8Char::from_slice_start(&[0xed, 0xa0, 0xbe, 0xa5])), Err(Utf16ReservedCodepoint));
    assert_eq!(kind(char::from_utf8_slice_start(&[0xed, 0xa0, 0xbf])), Err(Utf16ReservedCodepoint));
    assert_eq!(kind(Utf8Char::from_array([0xed, 0xbf, 0x80, 0xff])), Err(Utf16ReservedCodepoint));
    assert_eq!(kind(char::from_utf8_array([0xed, 0xbf, 0x8f, 0x00])), Err(Utf16ReservedCodepoint));
    assert_eq!(kind(Utf8Char::from_slice_start(&[0xed, 0xbf, 0xbe, 0xa5])), Err(Utf16ReservedCodepoint));
    assert_eq!(kind(char::from_utf8_slice_start(&[0xed, 0xbf, 0xbf])), Err(Utf16ReservedCodepoint));
}
#[test] fn utf8_first_is_continuation_byte() {
    // Every continuation byte (0b10xx_xxxx) as a lead byte; the shifted
    // follow-up bytes and the varying slice length are arbitrary noise.
    for first in 0x80..0xc0 {
        let arr = [first, first<<2, first<<4, first<<6];
        assert_eq!(kind(Utf8Char::from_array(arr)), Err(UnexpectedContinuationByte));
        assert_eq!(kind(char::from_utf8_array(arr)), Err(UnexpectedContinuationByte));
        let len = (1 + first%3) as usize;
        assert_eq!(kind(Utf8Char::from_slice_start(&arr[..len])), Err(UnexpectedContinuationByte));
        assert_eq!(kind(char::from_utf8_slice_start(&arr[..len])), Err(UnexpectedContinuationByte));
    }
}
#[test] fn utf8_too_long() {
    // Lead bytes 0xf8..=0xff would announce 5..8-byte sequences, which
    // UTF-8 doesn't have; alternate between a too-short and a long slice.
    for first in 0xf8..0x100 {
        let arr = [first as u8, 0x88, 0x80, 0x80];
        assert_eq!(kind(Utf8Char::from_array(arr)), Err(NonUtf8Byte));
        assert_eq!(kind(char::from_utf8_array(arr)), Err(NonUtf8Byte));
        let arr = [first as u8, 0x88, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80];
        let slice = &arr[..if first&1 == 0 {1} else {8}];
        assert_eq!(kind(Utf8Char::from_slice_start(slice)), Err(NonUtf8Byte));
        assert_eq!(kind(char::from_utf8_slice_start(slice)), Err(NonUtf8Byte));
    }
}
#[test] fn utf8_not_continuation_byte() {
    // For every valid multi-byte lead byte, corrupt one extra-byte position
    // at a time with values that are not continuation bytes.
    for first in 0xc2..0xf4 {
        let mut arr = [first, 0x90, 0xa0, 0xb0];
        let extra = first.extra_utf8_bytes().unwrap();
        for corrupt in (1..extra).rev() {
            for &bad in &[0x00, 0x3f, 0x40, 0x7f, 0xc0, 0xff] {
                arr[corrupt] = bad;
                assert_eq!(kind(Utf8Char::from_array(arr)), Err(InterruptedSequence), "{:?}", arr);
                assert_eq!(kind(char::from_utf8_array(arr)), Err(InterruptedSequence));
                let slice = if first&1 == 0 {&arr[..1+extra]} else {&arr};
                assert_eq!(kind(Utf8Char::from_slice_start(slice)), Err(InterruptedSequence), "{:?}", slice);
                assert_eq!(kind(char::from_utf8_slice_start(slice)), Err(InterruptedSequence));
            }
        }
    }
}

View File

@@ -0,0 +1,35 @@
/* Copyright 2018-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Tests that try all possible values for at least one parameter / byte / unit
//! of the tested function.
use core::char;
extern crate encode_unicode;
use encode_unicode::*;
#[test]
fn from_ascii() {
    // Every byte value: only those with the high bit clear are accepted,
    // and accepted ones round-trip through char.
    for cp in 0u32..256 {
        let byte = cp as u8;
        assert_eq!(Utf8Char::from_ascii(byte).is_ok(), byte & 0x80 == 0);
        if let Ok(u8c) = Utf8Char::from_ascii(byte) {
            assert_eq!(u8c, Utf8Char::from(byte as char));
        }
    }
}
#[test]
#[cfg_attr(miri, ignore)]
fn from_bmp() {
    // Every BMP value: from_bmp() must accept exactly the values that are
    // valid chars, and agree with the char-based conversion.
    for cp in 0u32..0x1_00_00 {
        let via_bmp = Utf16Char::from_bmp(cp as u16).ok();
        let via_char = char::from_u32(cp).map(Utf16Char::from);
        assert_eq!(via_bmp, via_char);
    }
}

186
vendor/encode_unicode/tests/iterators.rs vendored Normal file
View File

@@ -0,0 +1,186 @@
/* Copyright 2018-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Iterator tests
#![cfg(feature="std")]
#![allow(
    clippy::needless_collect,// test one thing at a time
)]
extern crate encode_unicode;
use encode_unicode::{IterExt, SliceExt, CharExt, Utf8Char};
use encode_unicode::iterator::Utf8CharSplitter;
use encode_unicode::error::Utf8ErrorKind::*;
use encode_unicode::error::Utf16PairError::*;
use std::io::Read;
use std::cmp::min;
#[test] fn utf8charmerger() {
    // Truncated 4-byte sequence, an ASCII byte, then a 2-byte sequence:
    // checks error reporting, size_hint shrinkage, the exact Debug output
    // (including the re-buffered bytes), and into_inner() position.
    let slice = b"\xf0\xa1\x92X\xcc\xbb";
    let mut iter = slice.iter().to_utf8chars();
    assert_eq!(iter.size_hint(), (1, Some(6)));
    assert_eq!(format!("{:?}", &iter),
        format!("Utf8CharMerger {{ buffered: [], inner: {:?} }}", slice.iter()));
    assert_eq!(iter.next().map(|v| v.map_err(|e| e.kind() ) ), Some(Err(InterruptedSequence)));
    assert_eq!(iter.size_hint(), (0, Some(5)));
    assert_eq!(
        format!("{:?}", &iter),
        format!("Utf8CharMerger {{ buffered: [161, 146, 88], inner: {:?} }}", slice[4..].iter())
    );
    assert_eq!(iter.next().map(|v| v.map_err(|e| e.kind() ) ), Some(Err(UnexpectedContinuationByte)));
    assert_eq!(iter.into_inner().next(), Some(&b'\xcc'));
}
#[test] fn utf8chardecoder() {
    // Slice starting with a too-high 4-byte sequence: the decoder reports the
    // error with length 1 and then resynchronizes byte by byte.
    let slice = b"\xf4\xbf\x80\x80XY\xcc\xbbZ_";
    let mut iter = slice.utf8char_indices();
    assert_eq!(iter.size_hint(), (2, Some(10)));
    assert_eq!(
        format!("{:?}", &iter),
        format!("Utf8CharDecoder {{ bytes[0..]: {:?} }}", &slice)
    );
    match iter.next() {
        Some((0, Err(e), 1)) => assert_eq!(e.kind(), TooHighCodepoint),
        wrong => panic!("Expected Some((0, Err(TooHighCodepoint), 1), got {:?}", wrong),
    }
    assert_eq!(
        format!("{:?}", &iter),
        format!("Utf8CharDecoder {{ bytes[1..]: {:?} }}", &slice[1..])
    );
    assert_eq!(iter.size_hint(), (2, Some(9)));
    // 9 remaining bytes decode as 3 continuation errors + "XY" + 'л' + "Z_".
    assert_eq!(iter.count(), 8);
}
#[test] fn utf16charmerger() {
    // Leading surrogate followed by a non-surrogate: the merger reports the
    // unmatched surrogate and buffers the unit it read ahead.
    let slice = [0xd800, 'x' as u16, 0xd900, 0xdfff, 'λ' as u16];
    let mut iter = slice.iter().to_utf16chars();
    assert_eq!(iter.size_hint(), (2, Some(5)));
    assert_eq!(format!("{:?}", &iter),
        format!("Utf16CharMerger {{ buffered: None, inner: {:?} }}", slice.iter()));
    assert_eq!(iter.next(), Some(Err(UnmatchedLeadingSurrogate)));
    assert_eq!(iter.size_hint(), (1, Some(4)));
    assert_eq!(
        format!("{:?}", &iter),
        format!("Utf16CharMerger {{ buffered: Some(120), inner: {:?} }}", slice[2..].iter())
    );
    assert_eq!(iter.into_inner().next(), Some(&0xd900));
}
#[test] fn utf16chardecoder() {
    // Same input as the merger test, but the slice-based decoder advances by
    // exactly one unit on error and exposes its position via Debug.
    let slice = [0xd800, 'x' as u16, 0xd900, 0xdfff, 'λ' as u16];
    let mut iter = slice.utf16char_indices();
    assert_eq!(iter.size_hint(), (2, Some(5)));
    assert_eq!(
        format!("{:?}", &iter),
        format!("Utf16CharDecoder {{ units[0..]: {:?} }}", &slice)
    );
    assert_eq!(iter.next(), Some((0, Err(UnmatchedLeadingSurrogate), 1)));
    assert_eq!(
        format!("{:?}", &iter),
        format!("Utf16CharDecoder {{ units[1..]: {:?} }}", &slice[1..])
    );
    assert_eq!(iter.size_hint(), (2, Some(4)));
    // Remaining: 'x', the valid surrogate pair, and 'λ'.
    assert_eq!(iter.count(), 3);
}
/// Tests for ensuring that iterators which also implement Read support
/// interleaving calls of `read()` and `next()`, and that they implement Read
/// correctly (support any buffer size at any time).
#[test] fn read_single_ascii() {
    let uc = 'a'.to_utf8();
    assert_eq!(uc.len(), 1);
    // `chunk` is the per-call read size; the buffer is pre-filled with 'E'
    // so writes past the expected end are detected.
    for chunk in 1..5 {
        let mut buf = [b'E'; 6];
        let mut iter = uc.into_iter();
        let mut written = 0;
        for _ in 0..4 {
            // A zero-length read must succeed without consuming anything.
            assert_eq!(iter.read(&mut buf[..0]).unwrap(), 0);
            let wrote = iter.read(&mut buf[written..written+chunk]).unwrap();
            assert_eq!(wrote, min(1-written, chunk));
            written += wrote;
            for &b in &buf[written..] {assert_eq!(b, b'E');}
            assert_eq!(buf[..written], AsRef::<[u8]>::as_ref(&uc)[..written]);
        }
        assert_eq!(written, 1);
    }
}
/// Same interleaved `read()` exercise as `read_single_ascii`,
/// but for a two-byte UTF-8 sequence.
#[test] fn read_single_nonascii() {
    let uc = 'ä'.to_utf8();
    assert_eq!(uc.len(), 2);
    for chunk_size in 1..5 {
        let mut buf = [b'E'; 6];
        let mut reader = uc.into_iter();
        let mut filled = 0;
        for _ in 0..4 {
            // a zero-length destination must always succeed and write nothing
            assert_eq!(reader.read(&mut buf[..0]).unwrap(), 0);
            let got = reader.read(&mut buf[filled..filled+chunk_size]).unwrap();
            // never more than requested, never more than what is left
            assert_eq!(got, min(2-filled, chunk_size));
            filled += got;
            // bytes beyond the filled prefix must be untouched
            assert!(buf[filled..].iter().all(|&b| b == b'E' ));
            assert_eq!(buf[..filled], AsRef::<[u8]>::as_ref(&uc)[..filled]);
        }
        assert_eq!(filled, 2);
    }
}
/// Drain a `Utf8CharSplitter` through `read()` with several buffer sizes,
/// checking each call fills exactly one chunk's worth, produces the same
/// bytes as the source string, and never writes past what it reports.
#[test] fn utf8charsplitter_read_all_sizes() {
    // mix of 1-, 2-, 3- and 4-byte UTF-8 sequences
    let s = "1111\u{104444}\u{222}1\u{833}1111\u{100004}";
    assert!(s.len()%3 == 1);
    // extra trailing capacity so over-writes past the string would be caught below
    let mut buf = vec![b'E'; s.len()+6];
    for size in 2..6 {//s.len()+4 {
        let mut reader = Utf8CharSplitter::from(s.chars().map(|c| c.to_utf8() ));
        for (offset, part) in s.as_bytes().chunks(size).enumerate() {
            // the final, possibly short, chunk gets the whole remaining buffer
            let read_to = if part.len() == size {(offset+1)*size} else {buf.len()};
            assert_eq!(reader.read(&mut buf[offset*size..read_to]).unwrap(), part.len());
            // everything read so far must match the source byte-for-byte
            assert_eq!(&buf[..offset*size+part.len()], &s.as_bytes()[..offset*size+part.len()]);
        }
        // exhausted: read() must now return Ok(0)
        assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
        // bytes beyond the string's length must never have been written
        assert!(buf[s.len()..].iter().all(|&b| b==b'E' ));
    }
}
/// Alternate between `next()` and `read()` on a `Utf8CharSplitter`,
/// verifying the two access paths yield the same byte stream in order.
#[test] fn utf8charsplitter_alternate_iter_read() {
    let s = "1111\u{104444}\u{222}1\u{833}1111\u{100004}";
    let mut buf = [b'0'; 10];
    // two runs so every byte position is consumed via both next() and read()
    for n in 0..2 {
        // need to collect to test size_hint()
        // because chars().size_hint() returns ((bytes+3)/4, Some(bytes))
        let u8chars = s.chars().map(|c| c.to_utf8() ).collect::<Vec<Utf8Char>>();
        let mut iter = Utf8CharSplitter::from(u8chars.into_iter());
        for (i, byte) in s.bytes().enumerate() {
            // number of continuation bytes (0b10xx_xxxx) still ahead in the current char
            let until_next = s.as_bytes()[i..].iter().take_while(|&b| (b>>6)==0b10u8 ).count();
            let remaining_chars = s[i+until_next..].chars().count();
            // progress trace, shown only when the test fails
            println!("{}. run: byte {:02} of {}, remaining: {:02}+{}: 0b{:08b} = {:?}",
                n, i, s.len(), remaining_chars, until_next, byte, byte as char);
            // a zero-sized read must always succeed without consuming anything
            assert_eq!(iter.read(&mut[][..]).unwrap(), 0);
            if i % 2 == n {
                assert_eq!(iter.next(), Some(byte));
            } else {
                assert_eq!(iter.read(&mut buf[..1]).unwrap(), 1);
                assert_eq!(buf[0], byte);
            }
        }
        // exhausted: iterator and Read interface must both agree it's over
        assert_eq!(iter.size_hint(), (0, Some(0)));
        assert_eq!(iter.next(), None);
        assert_eq!(iter.read(&mut buf[..]).unwrap(), 0);
    }
}

311
vendor/encode_unicode/tests/oks.rs vendored Normal file
View File

@@ -0,0 +1,311 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Test that every method gives the correct result for valid values.
//! Except iterators, which are stateful.
#![cfg(feature="std")]
#![allow(
clippy::eq_op, // testing the comparison
)]
use std::char;
use std::str::{self,FromStr};
use std::cmp::Ordering;
use std::hash::{Hash,Hasher};
use std::collections::hash_map::DefaultHasher;
use std::iter::FromIterator;
extern crate encode_unicode;
use encode_unicode::*;
/// `Utf8Char` and `Utf16Char` must default to the same codepoint as `char`.
#[test]
fn equal_defaults() {
    let default_char = char::default();
    assert_eq!(Utf8Char::default().to_char(), default_char);
    assert_eq!(Utf16Char::default().to_char(), default_char);
}
/// Both wrapper types must occupy exactly as much space as `char`.
#[test]
fn same_size_as_char() {
    use std::mem::size_of;
    let char_size = size_of::<char>();
    assert_eq!(size_of::<Utf8Char>(), char_size);
    assert_eq!(size_of::<Utf16Char>(), char_size);
}
/// `String` can be collected from and extended by both `&Utf16Char`
/// references and owned `Utf16Char` values.
#[test]
fn utf16chars_to_string() {
    let text = "\u{10ffff}\u{100000}\u{fee1}";
    let chars = text.chars().map(Utf16Char::from).collect::<Vec<Utf16Char>>();
    // FromIterator / Extend over references
    let mut by_ref: String = chars.iter().collect();
    assert_eq!(&by_ref, text);
    by_ref.extend(&chars);
    assert_eq!(&by_ref[text.len()..], text);
    // FromIterator / Extend over owned values
    let mut by_val: String = chars.iter().cloned().collect();
    assert_eq!(&by_val, text);
    by_val.extend(chars);
    assert_eq!(&by_val[text.len()..], text);
}
/// Representative codepoints: every UTF-8 / UTF-16 encoding-length boundary,
/// the edges of the surrogate gap, the ASCII letter-case ranges, and a few
/// midpoints in between.
const EDGES_AND_BETWEEN: [char;19] = [
    '\u{0}',// min
    '\u{3b}',// middle ASCII
    'A',// min ASCII uppercase
    'N',// middle ASCII uppercase
    'Z',// max ASCII uppercase
    'a',// min ASCII lowercase
    'm',// middle ASCII lowercase
    'z',// max ASCII lowercase
    '\u{7f}',// max ASCII and 1-byte UTF-8
    '\u{80}',// min 2-byte UTF-8
    '\u{111}',// middle
    '\u{7ff}',// max 2-byte UTF-8
    '\u{800}',// min 3-byte UTF-8
    '\u{d7ff}',// before reserved
    '\u{e000}',// after reserved
    '\u{ffff}',// max UTF-16 single and 3-byte UTF-8
    '\u{10000}',// min UTF-16 surrogate and 4-byte UTF-8
    '\u{abcde}',// middle
    '\u{10ffff}',// max
];
/// Exercise the Eq, Ord, Hash and ASCII case-comparison impls of `Utf8Char`
/// and `Utf16Char` against `char` behavior — both reflexively for `c` and
/// pairwise against every codepoint in `EDGES_AND_BETWEEN`.
/// Returns the two converted values for further testing by the caller.
fn eq_cmp_hash(c: char) -> (Utf8Char, Utf16Char) {
    // Hash a value with the std hasher so hash outputs can be compared.
    fn hash<T:Hash>(v: T) -> u64 {
        #[allow(deprecated)]
        let mut hasher = DefaultHasher::new();
        v.hash(&mut hasher);
        hasher.finish()
    }
    // reflexive properties of Utf8Char
    let u8c = c.to_utf8();
    assert_eq!(u8c.to_char(), c);
    assert_eq!(u8c, u8c);
    assert_eq!(hash(u8c), hash(u8c));
    assert_eq!(u8c.cmp(&u8c), Ordering::Equal);
    assert!(u8c.eq_ignore_ascii_case(&u8c));
    // reflexive properties of Utf16Char
    let u16c = c.to_utf16();
    assert_eq!(u16c.to_char(), c);
    assert_eq!(u16c, u16c);
    // NOTE(review): unlike the Utf8Char case above (hash(u8c) vs hash(u8c)),
    // this compares against hash(c) — presumably Utf16Char's Hash impl is
    // meant to match char's exactly; confirm against the impl.
    assert_eq!(hash(u16c), hash(c));
    assert_eq!(u16c.cmp(&u16c), Ordering::Equal);
    assert!(u16c.eq_ignore_ascii_case(&u16c));
    // cross-type equality with char and between the wrapper types
    assert_eq!(u8c, c);
    assert_eq!(c, u8c);
    assert_eq!(u16c, c);
    assert_eq!(c, u16c);
    assert_eq!(u8c, u16c);
    assert_eq!(u16c, u8c);
    // comparisons against primitive integers only match within their range
    assert_eq!(u8c == c as u8, c <= '\u{7F}');
    assert_eq!(u16c == c as u8, c <= '\u{FF}');
    assert_eq!(u16c == c as u16, c <= '\u{FFFF}');
    // PartialOrd with char and across the wrapper types
    assert_eq!(u8c.partial_cmp(&c), Some(Ordering::Equal));
    assert_eq!(c.partial_cmp(&u8c), Some(Ordering::Equal));
    assert_eq!(u16c.partial_cmp(&c), Some(Ordering::Equal));
    assert_eq!(c.partial_cmp(&u16c), Some(Ordering::Equal));
    assert_eq!(u8c.partial_cmp(&u16c), Some(Ordering::Equal));
    assert_eq!(u16c.partial_cmp(&u8c), Some(Ordering::Equal));
    // pairwise checks against every other representative codepoint
    for &other in &EDGES_AND_BETWEEN {
        let u8other = other.to_utf8();
        assert_eq!(u8c == u8other, c == other);
        assert_eq!(hash(u8c)==hash(u8other), hash(c)==hash(other));
        assert_eq!(u8c.cmp(&u8other), c.cmp(&other));
        assert_eq!(u8c.eq_ignore_ascii_case(&u8other), c.eq_ignore_ascii_case(&other));
        assert_eq!(u8c.partial_cmp(&other), c.partial_cmp(&other));
        assert_eq!(c.partial_cmp(&u8other), c.partial_cmp(&other));
        assert_eq!(u8other.partial_cmp(&c), other.partial_cmp(&c));
        assert_eq!(other.partial_cmp(&u8c), other.partial_cmp(&c));
        // u8 comparison only matches for ASCII values of `other`
        assert_eq!(u8c == other as u8, other as u8 <= 127 && c == other as u8 as char);
        let u16other = other.to_utf16();
        assert_eq!(u16c == u16other, c == other);
        assert_eq!(hash(u16c)==hash(u16other), hash(c)==hash(other));
        assert_eq!(u16c.cmp(&u16other), c.cmp(&other));
        assert_eq!(u16c.eq_ignore_ascii_case(&u16other), c.eq_ignore_ascii_case(&other));
        assert_eq!(u16c.partial_cmp(&other), c.partial_cmp(&other));
        assert_eq!(c.partial_cmp(&u16other), c.partial_cmp(&other));
        assert_eq!(u16other.partial_cmp(&c), other.partial_cmp(&c));
        assert_eq!(other.partial_cmp(&u16c), other.partial_cmp(&c));
        // comparing with truncated integers matches iff the truncation is lossless
        assert_eq!(u16c == other as u8, c == other as u8 as char);
        assert_eq!(u16c == other as u16, c as u32 == other as u16 as u32);
        // cross-type equality and ordering between the two wrapper types
        assert_eq!(u8c == u16other, c == other);
        assert_eq!(u16c == u8other, c == other);
        assert_eq!(u8c.partial_cmp(&u16other), c.partial_cmp(&other));
        assert_eq!(u16c.partial_cmp(&u8other), c.partial_cmp(&other));
        assert_eq!(u8other.partial_cmp(&u16c), other.partial_cmp(&c));
        assert_eq!(u16other.partial_cmp(&u8c), other.partial_cmp(&c));
    }
    (u8c, u16c)
}
/// Compare this crate's per-char UTF-8/UTF-16 iterators against iterating
/// over the slices produced by std's `encode_utf8()` / `encode_utf16()`,
/// deliberately continuing a few calls past exhaustion.
fn iterators(c: char) {
    let mut bytes = c.iter_utf8_bytes();
    let mut buf = [0; 4];
    let mut reference = c.encode_utf8(&mut buf[..]).as_bytes().iter();
    for _ in 0..6 {
        assert_eq!(bytes.size_hint(), reference.size_hint());
        assert_eq!(format!("{:?}", bytes), format!("{:?}", reference.as_slice()));
        assert_eq!(bytes.next(), reference.next().cloned());
    }
    let mut units = c.iter_utf16_units();
    let mut buf = [0; 2];
    let mut reference = c.encode_utf16(&mut buf[..]).iter();
    for _ in 0..4 {
        assert_eq!(units.size_hint(), reference.size_hint());
        assert_eq!(format!("{:?}", units), format!("{:?}", reference.as_slice()));
        assert_eq!(units.next(), reference.next().cloned());
    }
}
/// Run every conversion, slice-decoding, iterator, formatting and ASCII
/// method of this crate on a single codepoint, comparing each result
/// against the std `char` equivalents.
fn test(c: char) {
    // char <-> u32 round-trips
    assert_eq!(char::from_u32(c as u32), Some(c));
    assert_eq!(char::from_u32_detailed(c as u32), Ok(c));
    assert_eq!(unsafe{ char::from_u32_unchecked(c as u32) }, c);
    let (u8c, u16c) = eq_cmp_hash(c);
    iterators(c);
    // conversion between the two wrapper types
    assert_eq!(Utf16Char::from(u8c), u16c);
    assert_eq!(Utf8Char::from(u16c), u8c);
    let utf8_len = c.len_utf8();
    let utf16_len = c.len_utf16();
    let mut as_str = c.to_string();
    // UTF-8
    let mut buf = [0; 4];
    // std's encoding is the reference all crate methods are checked against
    let reference = c.encode_utf8(&mut buf[..]).as_bytes();
    let len = reference.len(); // short name because it is used in many places.
    assert_eq!(len, utf8_len);
    assert_eq!(reference[0].extra_utf8_bytes(), Ok(len-1));
    assert_eq!(reference[0].extra_utf8_bytes_unchecked(), len-1);
    assert_eq!(AsRef::<[u8]>::as_ref(&u8c), reference);
    // array round-trips
    let (arr,arrlen) = u8c.to_array();
    assert_eq!(arrlen, len);
    assert_eq!(Utf8Char::from_array(arr), Ok(u8c));
    assert_eq!(Utf8Char::new(c), u8c);
    assert_eq!(c.to_utf8_array(), (arr, len));
    // str round-trips
    let str_ = str::from_utf8(reference).unwrap();
    let ustr = Utf8Char::from_str(str_).unwrap();
    assert_eq!(ustr.to_array().0, arr);// bitwise equality
    assert_eq!(char::from_utf8_array(arr), Ok(c));
    // decoding from slices longer than the character itself
    let mut longer = [0xff; 5]; // 0xff is never valid
    longer[..len].copy_from_slice(reference);
    assert_eq!(char::from_utf8_slice_start(reference), Ok((c,len)));
    assert_eq!(char::from_utf8_slice_start(&longer), Ok((c,len)));
    assert_eq!(Utf8Char::from_slice_start(reference), Ok((u8c,len)));
    assert_eq!(Utf8Char::from_slice_start(&longer), Ok((u8c,len)));
    // make the tail valid UTF-8 so the whole buffer can be used as a str
    for other in &mut longer[len..] {*other = b'?'}
    assert_eq!(Utf8Char::from_str(str_), Ok(u8c));
    assert_eq!(Utf8Char::from_str_start(str_), Ok((u8c,len)));
    assert_eq!(Utf8Char::from_str_start(str::from_utf8(&longer).unwrap()), Ok((u8c,len)));
    unsafe {
        // Hopefully make bugs easier to catch by making reads into unallocated memory by filling
        // a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes.
        // I have no idea whether this works.
        let mut boxed = Box::new([0xffu8; 16]);
        let start = boxed.len()-len; // reach the end
        boxed[start..].copy_from_slice(reference);
        let slice = &boxed[start..];
        assert_eq!(Utf8Char::from_slice_start_unchecked(slice), (u8c,len));
    }
    // FromIterator, formatting and ASCII methods must match char's
    assert_eq!(&Vec::<u8>::from_iter(Some(u8c))[..], reference);
    assert_eq!(&String::from_iter(Some(u8c))[..], str_);
    assert_eq!(format!("{:?}", u8c), format!("{:?}", c));
    assert_eq!(format!("{}", u8c), format!("{}", c));
    assert_eq!(u8c.is_ascii(), c.is_ascii());
    assert_eq!(u8c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase());
    assert_eq!(u8c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase());
    // UTF-16 (mirrors the UTF-8 section above; `len`/`reference` are rebound)
    let mut buf = [0; 2];
    let reference = c.encode_utf16(&mut buf[..]);
    let len = reference.len();
    assert_eq!(len, utf16_len);
    assert_eq!(reference[0].utf16_needs_extra_unit(), Ok(len==2));
    assert_eq!(reference[0].is_utf16_leading_surrogate(), len==2);
    assert_eq!(u16c.as_ref(), reference);
    assert_eq!(Utf16Char::new(c), u16c);
    // decoding from slices longer than the character itself
    let mut longer = [0; 3];
    longer[..len].copy_from_slice(reference);
    assert_eq!(char::from_utf16_slice_start(reference), Ok((c,len)));
    assert_eq!(char::from_utf16_slice_start(&longer), Ok((c,len)));
    assert_eq!(Utf16Char::from_slice_start(reference), Ok((u16c,len)));
    assert_eq!(Utf16Char::from_slice_start(&longer), Ok((u16c,len)));
    assert_eq!(Utf16Char::from_str(&as_str), Ok(u16c));
    as_str.push(c);
    // from_str_start reports the consumed length in UTF-8 bytes, hence utf8_len
    assert_eq!(Utf16Char::from_str_start(&as_str), Ok((u16c,utf8_len)));
    unsafe {
        // Hopefully make bugs easier to catch by making reads into unallocated memory by filling
        // a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes.
        // I have no idea whether this works.
        let mut boxed = Box::new([0u16; 8]);
        let start = boxed.len()-len; // reach the end
        boxed[start..].copy_from_slice(reference);
        let slice = &boxed[start..];
        assert_eq!(Utf16Char::from_slice_start_unchecked(slice), (u16c,len));
    }
    // array and tuple forms
    let array = c.to_utf16_array();
    let tuple = c.to_utf16_tuple();
    assert_eq!(&array[..reference.len()], reference);
    assert_eq!(tuple, (reference[0],reference.get(1).cloned()));
    assert_eq!(char::from_utf16_array(array), Ok(c));
    assert_eq!(char::from_utf16_tuple(tuple), Ok(c));
    assert_eq!(c.to_utf16().to_char(), c);
    // FromIterator, formatting and ASCII methods must match char's
    assert_eq!(&Vec::<u16>::from_iter(Some(u16c))[..], reference);
    assert_eq!(format!("{:?}", u16c), format!("{:?}", c));
    assert_eq!(format!("{}", u16c), format!("{}", c));
    assert_eq!(u16c.is_ascii(), c.is_ascii());
    assert_eq!(u16c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase());
    assert_eq!(u16c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase());
}
/// Run the full test battery on each boundary / representative codepoint.
#[test]
fn edges_and_middle() {
    EDGES_AND_BETWEEN.iter().for_each(|&c| test(c) );
}
// Test EVERY codepoint.
// By splitting into multiple tests we get multithreading for free.
// The tests are #[ignore]d by default; run with `cargo test -- --ignored`.
/// Generates an `#[ignore]`d test that runs `test()` for every codepoint
/// in `$range` (the ranges below skip the surrogate gap).
macro_rules! test_codepoint_range {($name:ident, $range:expr) => {
    #[test]
    #[ignore]
    fn $name() {
        for cp in $range {
            let c = char::from_u32(cp).expect("not a valid char");
            test(c);
        }
    }
}}
// 0xd800..0xe000 is the surrogate range, which `char` cannot represent,
// so it is excluded between the first two ranges.
test_codepoint_range!{all_0000_d800, 0x0000..0xd800}
test_codepoint_range!{all_e000_10000, 0xe000..0x10000}
test_codepoint_range!{all_10000_20000, 0x10000..0x20000}
test_codepoint_range!{all_20000_30000, 0x20000..0x30000}
test_codepoint_range!{all_30000_40000, 0x30000..0x40000}
test_codepoint_range!{all_40000_50000, 0x40000..0x50000}
test_codepoint_range!{all_50000_60000, 0x50000..0x60000}
test_codepoint_range!{all_60000_70000, 0x60000..0x70000}
test_codepoint_range!{all_70000_80000, 0x70000..0x80000}
test_codepoint_range!{all_80000_90000, 0x80000..0x90000}
test_codepoint_range!{all_90000_a0000, 0x90000..0xa0000}
test_codepoint_range!{all_a0000_b0000, 0xa0000..0xb0000}
test_codepoint_range!{all_b0000_c0000, 0xb0000..0xc0000}
test_codepoint_range!{all_c0000_d0000, 0xc0000..0xd0000}
test_codepoint_range!{all_d0000_e0000, 0xd0000..0xe0000}
test_codepoint_range!{all_e0000_f0000, 0xe0000..0xf0000}
test_codepoint_range!{all_f0000_100000, 0xf0000..0x100000}
test_codepoint_range!{all_100000_110000, 0x100000..0x110000}