283 lines
12 KiB
Rust
283 lines
12 KiB
Rust
/* Copyright 2018-2022 Torbjørn Birch Moltu
|
|
*
|
|
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
|
|
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
|
|
* http://opensource.org/licenses/MIT>, at your option. This file may not be
|
|
* copied, modified, or distributed except according to those terms.
|
|
*/
|
|
|
|
// Run with -- --nocapture to show error messages if setup fails.
|
|
// (or use ./do.sh)
|
|
|
|
#![cfg(feature="std")]
|
|
#![feature(test)]
|
|
extern crate test;
|
|
use test::{Bencher, black_box};
|
|
|
|
use std::fs;
|
|
use std::path::Path;
|
|
use std::io::ErrorKind;
|
|
use std::thread::sleep;
|
|
use std::time::Duration;
|
|
use std::collections::HashMap;
|
|
extern crate minreq;
|
|
#[macro_use] extern crate lazy_static;
|
|
extern crate encode_unicode;
|
|
use encode_unicode::{CharExt, Utf8Char, U8UtfExt, Utf16Char, U16UtfExt};
|
|
|
|
// Setup; need longish strings to make benchmarks representative and
|
|
// reduce overhead (might get cache misses now though)
|
|
// Therefore we download a few wikipedia articles in different languages.
|
|
// Downloading a fixed revision of the articles doesn't prevent the HTML from
|
|
// changing due to changes in templates or rendering.
|
|
|
|
fn load_wikipedia(language: &str, article: &str, english: &str, revision: usize) -> String {
|
|
let cache_path = Path::new("benches").join("texts");
|
|
let cache_path = cache_path.to_str().unwrap();
|
|
let name = format!("{}_{}.html", language, english);
|
|
let path = Path::new(cache_path).join(&name);
|
|
let path = path.to_str().unwrap();
|
|
match fs::read_to_string(path) {
|
|
Ok(content) => return content,
|
|
Err(ref e) if e.kind() == ErrorKind::NotFound => {},//continue
|
|
Err(ref e) if e.kind() == ErrorKind::InvalidData => {
|
|
panic!("{} exists but is not UTF-8", &name);
|
|
},
|
|
Err(e) => panic!("{} exists but cannot be read ({})", path, e),
|
|
}
|
|
let mut article_ascii = String::new();
|
|
for c in article.chars() {
|
|
if c.is_ascii() {
|
|
article_ascii.push(c);
|
|
} else {
|
|
let encoded = format!("%{:2X}", c as u32);
|
|
article_ascii.push_str(encoded.as_str());
|
|
}
|
|
}
|
|
let url = format!("https://{}.m.wikipedia.org/w/index.php?title={}&oldid={}",
|
|
language, article_ascii, revision
|
|
);
|
|
println!("Downloading {} and saving to {}", &url, path);
|
|
let response = minreq::get(&url).send().unwrap_or_else(|e| {
|
|
panic!("Cannot get {}: {}", url, e);
|
|
});
|
|
if response.status_code != 200 {
|
|
panic!("Bad URL {}: {} {}", url, response.status_code, response.reason_phrase);
|
|
}
|
|
let content = String::from_utf8(response.into_bytes()).unwrap_or_else(|_| {
|
|
panic!("Response from {} is not UTF-8", url);
|
|
});
|
|
if let Err(e) = fs::create_dir_all(cache_path) {
|
|
eprintln!("Warning: failed to create directory {}: {}", cache_path, e);
|
|
} else if let Err(e) = fs::write(&path, &content) {
|
|
eprintln!("Warning: failed to save {}: {}", path, e);
|
|
}
|
|
sleep(Duration::from_secs(1));
|
|
content
|
|
}
|
|
/// The Wikipedia articles that are used as benchmark input.
///
/// Every entry is `(language, article, english, revision)`, which is the
/// parameter order of `load_wikipedia()`.
const ARTICLES: &[(&str, &str, &str, usize)] = &[
    // 99.7% ASCII
    ("en", "United_Kingdom", "United_Kingdom", 855_522_252),
    // 1.75% 2-byte characters
    ("es", "España", "Spain", 109_861_222),
    // 36% 2-byte characters
    ("ru", "Россия", "Russia", 94_607_243),
    // 30% 3-byte characters
    ("zh", "中國", "China", 50_868_604),
];
|
|
lazy_static!{
    /// The text of every article in `ARTICLES`, keyed by language,
    /// plus an `"ascii"` entry that contains only ASCII characters.
    static ref STRINGS: HashMap<&'static str, String> = {
        let mut texts: HashMap<&'static str, String> = ARTICLES.iter()
            .map(|&(language, article, english, revision)| {
                (language, load_wikipedia(language, article, english, revision))
            })
            .collect();
        // Build the ASCII-only text from the article with the most ASCII
        // bytes by dropping everything else from it.
        let ascii_only: String = texts.values()
            .max_by_key(|text| text.bytes().filter(u8::is_ascii).count() )
            .unwrap()
            .chars()
            .filter(char::is_ascii)
            .collect();
        texts.insert("ascii", ascii_only);
        texts
    };

    /// Prefixes of the texts in `STRINGS` that all have the same number of
    /// `char`s: the number of `char`s in the text that has the fewest.
    static ref EQUAL_CHARS: HashMap<&'static str, &'static str> = {
        let (shortest, limit) = STRINGS.iter()
            .map(|(language, text)| (language, text.chars().count()) )
            .min_by_key(|&(_, count)| count )
            .unwrap();
        println!("chars: {} (limited by {})", limit, shortest);
        let mut prefixes = HashMap::new();
        for (&language, text) in STRINGS.iter() {
            // byte offset of the first character that should not be included
            let end = match text.char_indices().nth(limit) {
                Some((offset, _)) => offset,
                None => text.len(),
            };
            let prefix = &text[..end];
            assert_eq!(prefix.chars().count(), limit);
            prefixes.insert(language, prefix);
        }
        prefixes
    };

    /// Texts from `STRINGS` reduced to the same number of UTF-8 bytes:
    /// the length of the text that has the fewest.
    static ref EQUAL_BYTES: HashMap<&'static str, String> = {
        let (shortest, limit) = STRINGS.iter()
            .map(|(language, text)| (language, text.len()) )
            .min_by_key(|&(_, length)| length )
            .unwrap();
        println!("bytes: {} (limited by {})", limit, shortest);
        let mut reduced = HashMap::new();
        for (&language, text) in STRINGS.iter() {
            // Take just so many characters that their total length is exactly
            // `limit` bytes. Slicing the text would not work when
            // `!text.is_char_boundary(limit)`, so characters that do not fit
            // in what remains of the budget are skipped instead.
            let mut budget = limit;
            let mut kept = String::new();
            for c in text.chars() {
                if let Some(rest) = budget.checked_sub(c.len_utf8()) {
                    budget = rest;
                    kept.push(c);
                }
            }
            assert_eq!(kept.len(), limit);
            reduced.insert(language, kept);
        }
        reduced
    };

    /// Texts from `STRINGS` reduced to the same number of UTF-16 units:
    /// the number of units needed by the text that needs the fewest.
    static ref EQUAL_UNITS: HashMap<&'static str, String> = {
        let (shortest, limit) = STRINGS.iter()
            .map(|(language, text)| (language, text.chars().map(char::len_utf16).sum::<usize>()) )
            .min_by_key(|&(_, length)| length )
            .unwrap();
        println!("units: {} (limited by {})", limit, shortest);
        let mut reduced = HashMap::new();
        for (&language, text) in STRINGS.iter() {
            // Same approach as for bytes: skip characters that would exceed
            // what remains of the unit budget.
            let mut budget = limit;
            let mut kept = String::new();
            for c in text.chars() {
                if let Some(rest) = budget.checked_sub(c.len_utf16()) {
                    budget = rest;
                    kept.push(c);
                }
            }
            assert_eq!(kept.chars().map(char::len_utf16).sum::<usize>(), limit);
            reduced.insert(language, kept);
        }
        reduced
    };
}
|
|
|
|
|
|
|
|
///////////////////////////
|
|
// benchmarks begin here //
|
|
///////////////////////////
|
|
|
|
fn utf8char_len(language: &str, b: &mut Bencher) {
|
|
let string = &EQUAL_BYTES[language];
|
|
let chars: Vec<Utf8Char> = string.chars().map(|c| c.to_utf8() ).collect();
|
|
let bytes = string.len();
|
|
b.iter(|| {
|
|
let sum: usize = black_box(&chars).iter().map(|u8c| u8c.len() ).sum();
|
|
assert_eq!(sum, bytes);
|
|
});
|
|
}
|
|
#[bench] fn utf8char_len_ascii(b: &mut Bencher) {utf8char_len("ascii", b)}
|
|
#[bench] fn utf8char_len_en(b: &mut Bencher) {utf8char_len("en", b)}
|
|
#[bench] fn utf8char_len_es(b: &mut Bencher) {utf8char_len("es", b)}
|
|
#[bench] fn utf8char_len_ru(b: &mut Bencher) {utf8char_len("ru", b)}
|
|
#[bench] fn utf8char_len_zh(b: &mut Bencher) {utf8char_len("zh", b)}
|
|
|
|
fn utf8_extra_bytes_unchecked(language: &str, b: &mut Bencher) {
|
|
let string = &EQUAL_CHARS[language];
|
|
let chars = string.chars().count();
|
|
let string = string.as_bytes();
|
|
b.iter(|| {
|
|
let mut i = 0;
|
|
let mut loops = 0;
|
|
while i < string.len() {
|
|
i += string[i].extra_utf8_bytes_unchecked();
|
|
i += 1;
|
|
loops += 1;
|
|
}
|
|
assert_eq!(loops, chars);
|
|
});
|
|
}
|
|
#[bench] fn utf8_extra_bytes_unchecked_ascii(b: &mut Bencher) {utf8_extra_bytes_unchecked("ascii", b)}
|
|
#[bench] fn utf8_extra_bytes_unchecked_en(b: &mut Bencher) {utf8_extra_bytes_unchecked("en", b)}
|
|
#[bench] fn utf8_extra_bytes_unchecked_es(b: &mut Bencher) {utf8_extra_bytes_unchecked("es", b)}
|
|
#[bench] fn utf8_extra_bytes_unchecked_ru(b: &mut Bencher) {utf8_extra_bytes_unchecked("ru", b)}
|
|
#[bench] fn utf8_extra_bytes_unchecked_zh(b: &mut Bencher) {utf8_extra_bytes_unchecked("zh", b)}
|
|
|
|
fn utf8_extra_bytes(language: &str, b: &mut Bencher) {
|
|
let string = &EQUAL_CHARS[language];
|
|
let chars = string.chars().count();
|
|
let string = string.as_bytes();
|
|
b.iter(|| {
|
|
let mut i = 0;
|
|
let mut loops = 0;
|
|
let mut errors = 0;
|
|
while i < string.len() {
|
|
match string[i].extra_utf8_bytes() {
|
|
Ok(n) => i += n,
|
|
Err(_) => errors += 1,
|
|
}
|
|
i += 1;
|
|
loops += 1;
|
|
}
|
|
assert_eq!(loops, chars);
|
|
assert_eq!(errors, 0);
|
|
});
|
|
}
|
|
#[bench] fn utf8_extra_bytes_ascii(b: &mut Bencher) {utf8_extra_bytes("ascii", b)}
|
|
#[bench] fn utf8_extra_bytes_en(b: &mut Bencher) {utf8_extra_bytes("en", b)}
|
|
#[bench] fn utf8_extra_bytes_es(b: &mut Bencher) {utf8_extra_bytes("es", b)}
|
|
#[bench] fn utf8_extra_bytes_ru(b: &mut Bencher) {utf8_extra_bytes("ru", b)}
|
|
#[bench] fn utf8_extra_bytes_zh(b: &mut Bencher) {utf8_extra_bytes("zh", b)}
|
|
|
|
|
|
fn utf16char_len(language: &str, b: &mut Bencher) {
|
|
let string = &EQUAL_UNITS[language];
|
|
let chars: Vec<Utf16Char> = string.chars().map(|c| c.to_utf16() ).collect();
|
|
let units = string.chars().map(|c| c.len_utf16() ).sum::<usize>();
|
|
b.iter(|| {
|
|
let sum: usize = black_box(&chars).iter().map(|u8c| u8c.len() ).sum();
|
|
assert_eq!(sum, units);
|
|
});
|
|
}
|
|
#[bench] fn utf16char_len_ascii(b: &mut Bencher) {utf16char_len("ascii", b)}
|
|
#[bench] fn utf16char_len_en(b: &mut Bencher) {utf16char_len("en", b)}
|
|
#[bench] fn utf16char_len_es(b: &mut Bencher) {utf16char_len("en", b)}
|
|
#[bench] fn utf16char_len_ru(b: &mut Bencher) {utf16char_len("ru", b)}
|
|
#[bench] fn utf16char_len_zh(b: &mut Bencher) {utf16char_len("zh", b)}
|
|
|
|
fn utf16_is_leading_surrogate(language: &str, b: &mut Bencher) {
|
|
let string = &EQUAL_UNITS[language];
|
|
let chars = string.chars().count();
|
|
let string: Vec<u16> = string.chars().map(|c| c.to_utf16() ).collect();
|
|
b.iter(|| {
|
|
let mut i = 0;
|
|
let mut loops = 0;
|
|
while i < string.len() {
|
|
i += if string[i].is_utf16_leading_surrogate() {2} else {1};
|
|
loops += 1;
|
|
}
|
|
assert_eq!(loops, chars);
|
|
});
|
|
}
|
|
#[bench] fn utf16_is_leading_surrogate_ascii(b: &mut Bencher) {utf16_is_leading_surrogate("ascii", b)}
|
|
#[bench] fn utf16_is_leading_surrogate_en(b: &mut Bencher) {utf16_is_leading_surrogate("en", b)}
|
|
#[bench] fn utf16_is_leading_surrogate_es(b: &mut Bencher) {utf16_is_leading_surrogate("es", b)}
|
|
#[bench] fn utf16_is_leading_surrogate_ru(b: &mut Bencher) {utf16_is_leading_surrogate("ru", b)}
|
|
#[bench] fn utf16_is_leading_surrogate_zh(b: &mut Bencher) {utf16_is_leading_surrogate("zh", b)}
|
|
|
|
fn utf16_needs_extra_unit(language: &str, b: &mut Bencher) {
|
|
let string = &EQUAL_UNITS[language];
|
|
let chars = string.chars().count();
|
|
let string: Vec<u16> = string.chars().map(|c| c.to_utf16() ).collect();
|
|
b.iter(|| {
|
|
let mut i = 0;
|
|
let mut loops = 0;
|
|
let mut errors = 0;
|
|
while i < string.len() {
|
|
i += match string[i].utf16_needs_extra_unit() {
|
|
Ok(true) => 2,
|
|
Ok(false) => 1,
|
|
Err(_) => {errors+=1; 1}
|
|
};
|
|
loops += 1;
|
|
}
|
|
assert_eq!(loops, chars);
|
|
assert_eq!(errors, 0);
|
|
});
|
|
}
|
|
#[bench] fn utf16_needs_extra_unit_ascii(b: &mut Bencher) {utf16_needs_extra_unit("ascii", b)}
|
|
#[bench] fn utf16_needs_extra_unit_en(b: &mut Bencher) {utf16_needs_extra_unit("en", b)}
|
|
#[bench] fn utf16_needs_extra_unit_es(b: &mut Bencher) {utf16_needs_extra_unit("es", b)}
|
|
#[bench] fn utf16_needs_extra_unit_ru(b: &mut Bencher) {utf16_needs_extra_unit("ru", b)}
|
|
#[bench] fn utf16_needs_extra_unit_zh(b: &mut Bencher) {utf16_needs_extra_unit("zh", b)}
|