/* Copyright 2018-2022 Torbjørn Birch Moltu * * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be * copied, modified, or distributed except according to those terms. */ // Run with -- --nocapture to show error messages if setup fails. // (or use ./do.sh) #![cfg(feature="std")] #![feature(test)] extern crate test; use test::{Bencher, black_box}; use std::fs; use std::path::Path; use std::io::ErrorKind; use std::thread::sleep; use std::time::Duration; use std::collections::HashMap; extern crate minreq; #[macro_use] extern crate lazy_static; extern crate encode_unicode; use encode_unicode::{CharExt, Utf8Char, U8UtfExt, Utf16Char, U16UtfExt}; // Setup; need longish strings to make benchmarks representative and // reduce overhead (might get cache misses now though) // Therefore we download a few wikipedia articles in different languages. // Downloading a fixed revision of the articles doesn't prevent the HTML from // changing due to changes in templates or rendering. fn load_wikipedia(language: &str, article: &str, english: &str, revision: usize) -> String { let cache_path = Path::new("benches").join("texts"); let cache_path = cache_path.to_str().unwrap(); let name = format!("{}_{}.html", language, english); let path = Path::new(cache_path).join(&name); let path = path.to_str().unwrap(); match fs::read_to_string(path) { Ok(content) => return content, Err(ref e) if e.kind() == ErrorKind::NotFound => {},//continue Err(ref e) if e.kind() == ErrorKind::InvalidData => { panic!("{} exists but is not UTF-8", &name); }, Err(e) => panic!("{} exists but cannot be read ({})", path, e), } let mut article_ascii = String::new(); for c in article.chars() { if c.is_ascii() { article_ascii.push(c); } else { let encoded = format!("%{:2X}", c as u32); article_ascii.push_str(encoded.as_str()); } } let url = format!("https://{}.m.wikipedia.org/w/index.php?title={}&oldid={}", language, article_ascii, revision ); println!("Downloading {} and saving to {}", &url, path); let response = minreq::get(&url).send().unwrap_or_else(|e| { panic!("Cannot get {}: {}", url, e); }); if response.status_code != 200 { panic!("Bad URL {}: {} {}", url, response.status_code, response.reason_phrase); } let content = String::from_utf8(response.into_bytes()).unwrap_or_else(|_| { panic!("Response from {} is not UTF-8", url); }); if let Err(e) = fs::create_dir_all(cache_path) { eprintln!("Warning: failed to create directory {}: {}", cache_path, e); } else if let Err(e) = fs::write(&path, &content) { eprintln!("Warning: failed to save {}: {}", path, e); } sleep(Duration::from_secs(1)); content } const ARTICLES: &[(&str, &str, &str, usize)] = &[ ("en", "United_Kingdom", "United_Kingdom", 855522252),// 99,7% ASCII ("es", "España", "Spain", 109861222),// 1,75% 2-byte characters ("ru", "Россия", "Russia", 94607243),// 36% 2-byte characters ("zh", "中國", "China", 50868604),// 30% 3-byte characters ]; lazy_static!{ static ref STRINGS: HashMap<&'static str, String> = { let mut content = HashMap::new(); for &(language, article, english, revision) in ARTICLES { content.insert(language, load_wikipedia(language, article, english, revision)); } // make one string with only ASCII let only_ascii = content.values() .map(|v| (v, v.bytes().filter(|b| b.is_ascii() ).count()) ) .max_by_key(|&(_,len)| len ) .map(|(v,_)| v.bytes().filter(|b| b.is_ascii() ).map(|b| b as char ).collect() ) .unwrap(); content.insert("ascii", only_ascii); content }; static ref EQUAL_CHARS: HashMap<&'static str, &'static str> = { let (least, chars) = STRINGS.iter() .map(|(l,s)| (l, s.chars().count()) ) .min_by_key(|&(_,chars)| chars ) .unwrap(); println!("chars: {} (limited by {})", chars, least); STRINGS.iter().map(|(&language, string)| { let cut = string.char_indices() .nth(chars) .map_or(string.len(), |(i,_)| i ); let string = &string[..cut]; assert_eq!(string.chars().count(), chars); (language, string) }).collect() }; static ref EQUAL_BYTES: HashMap<&'static str, String> = { let (least, bytes) = STRINGS.iter() .map(|(l,s)| (l, s.len()) ) .min_by_key(|&(_,bytes)| bytes ) .unwrap(); println!("bytes: {} (limited by {})", bytes, least); STRINGS.iter().map(|(&language, string)| { let mut remaining = bytes; // take just so many characters that their length is exactly $bytes // slicing won't if !string.is_char_boundary(bytes), let string = string.chars().filter(|c| { match remaining.checked_sub(c.len_utf8()) { Some(after) => {remaining = after; true}, None => false } }).collect::(); assert_eq!(string.len(), bytes); (language, string) }).collect() }; static ref EQUAL_UNITS: HashMap<&'static str, String> = { let (least, units) = STRINGS.iter() .map(|(l,s)| (l, s.chars().map(|c| c.len_utf16() ).sum::()) ) .min_by_key(|&(_,units)| units ) .unwrap(); println!("units: {} (limited by {})", units, least); STRINGS.iter().map(|(&language, string)| { let mut remaining = units; let string = string.chars().filter(|c| { match remaining.checked_sub(c.len_utf16()) { Some(after) => {remaining = after; true}, None => false } }).collect::(); assert_eq!(string.chars().map(|c| c.len_utf16() ).sum::(), units); (language, string) }).collect() }; } /////////////////////////// // benchmarks begin here // /////////////////////////// fn utf8char_len(language: &str, b: &mut Bencher) { let string = &EQUAL_BYTES[language]; let chars: Vec = string.chars().map(|c| c.to_utf8() ).collect(); let bytes = string.len(); b.iter(|| { let sum: usize = black_box(&chars).iter().map(|u8c| u8c.len() ).sum(); assert_eq!(sum, bytes); }); } #[bench] fn utf8char_len_ascii(b: &mut Bencher) {utf8char_len("ascii", b)} #[bench] fn utf8char_len_en(b: &mut Bencher) {utf8char_len("en", b)} #[bench] fn utf8char_len_es(b: &mut Bencher) {utf8char_len("es", b)} #[bench] fn utf8char_len_ru(b: &mut Bencher) {utf8char_len("ru", b)} #[bench] fn utf8char_len_zh(b: &mut Bencher) {utf8char_len("zh", b)} fn utf8_extra_bytes_unchecked(language: &str, b: &mut Bencher) { let string = &EQUAL_CHARS[language]; let chars = string.chars().count(); let string = string.as_bytes(); b.iter(|| { let mut i = 0; let mut loops = 0; while i < string.len() { i += string[i].extra_utf8_bytes_unchecked(); i += 1; loops += 1; } assert_eq!(loops, chars); }); } #[bench] fn utf8_extra_bytes_unchecked_ascii(b: &mut Bencher) {utf8_extra_bytes_unchecked("ascii", b)} #[bench] fn utf8_extra_bytes_unchecked_en(b: &mut Bencher) {utf8_extra_bytes_unchecked("en", b)} #[bench] fn utf8_extra_bytes_unchecked_es(b: &mut Bencher) {utf8_extra_bytes_unchecked("es", b)} #[bench] fn utf8_extra_bytes_unchecked_ru(b: &mut Bencher) {utf8_extra_bytes_unchecked("ru", b)} #[bench] fn utf8_extra_bytes_unchecked_zh(b: &mut Bencher) {utf8_extra_bytes_unchecked("zh", b)} fn utf8_extra_bytes(language: &str, b: &mut Bencher) { let string = &EQUAL_CHARS[language]; let chars = string.chars().count(); let string = string.as_bytes(); b.iter(|| { let mut i = 0; let mut loops = 0; let mut errors = 0; while i < string.len() { match string[i].extra_utf8_bytes() { Ok(n) => i += n, Err(_) => errors += 1, } i += 1; loops += 1; } assert_eq!(loops, chars); assert_eq!(errors, 0); }); } #[bench] fn utf8_extra_bytes_ascii(b: &mut Bencher) {utf8_extra_bytes("ascii", b)} #[bench] fn utf8_extra_bytes_en(b: &mut Bencher) {utf8_extra_bytes("en", b)} #[bench] fn utf8_extra_bytes_es(b: &mut Bencher) {utf8_extra_bytes("es", b)} #[bench] fn utf8_extra_bytes_ru(b: &mut Bencher) {utf8_extra_bytes("ru", b)} #[bench] fn utf8_extra_bytes_zh(b: &mut Bencher) {utf8_extra_bytes("zh", b)} fn utf16char_len(language: &str, b: &mut Bencher) { let string = &EQUAL_UNITS[language]; let chars: Vec = string.chars().map(|c| c.to_utf16() ).collect(); let units = string.chars().map(|c| c.len_utf16() ).sum::(); b.iter(|| { let sum: usize = black_box(&chars).iter().map(|u8c| u8c.len() ).sum(); assert_eq!(sum, units); }); } #[bench] fn utf16char_len_ascii(b: &mut Bencher) {utf16char_len("ascii", b)} #[bench] fn utf16char_len_en(b: &mut Bencher) {utf16char_len("en", b)} #[bench] fn utf16char_len_es(b: &mut Bencher) {utf16char_len("en", b)} #[bench] fn utf16char_len_ru(b: &mut Bencher) {utf16char_len("ru", b)} #[bench] fn utf16char_len_zh(b: &mut Bencher) {utf16char_len("zh", b)} fn utf16_is_leading_surrogate(language: &str, b: &mut Bencher) { let string = &EQUAL_UNITS[language]; let chars = string.chars().count(); let string: Vec = string.chars().map(|c| c.to_utf16() ).collect(); b.iter(|| { let mut i = 0; let mut loops = 0; while i < string.len() { i += if string[i].is_utf16_leading_surrogate() {2} else {1}; loops += 1; } assert_eq!(loops, chars); }); } #[bench] fn utf16_is_leading_surrogate_ascii(b: &mut Bencher) {utf16_is_leading_surrogate("ascii", b)} #[bench] fn utf16_is_leading_surrogate_en(b: &mut Bencher) {utf16_is_leading_surrogate("en", b)} #[bench] fn utf16_is_leading_surrogate_es(b: &mut Bencher) {utf16_is_leading_surrogate("es", b)} #[bench] fn utf16_is_leading_surrogate_ru(b: &mut Bencher) {utf16_is_leading_surrogate("ru", b)} #[bench] fn utf16_is_leading_surrogate_zh(b: &mut Bencher) {utf16_is_leading_surrogate("zh", b)} fn utf16_needs_extra_unit(language: &str, b: &mut Bencher) { let string = &EQUAL_UNITS[language]; let chars = string.chars().count(); let string: Vec = string.chars().map(|c| c.to_utf16() ).collect(); b.iter(|| { let mut i = 0; let mut loops = 0; let mut errors = 0; while i < string.len() { i += match string[i].utf16_needs_extra_unit() { Ok(true) => 2, Ok(false) => 1, Err(_) => {errors+=1; 1} }; loops += 1; } assert_eq!(loops, chars); assert_eq!(errors, 0); }); } #[bench] fn utf16_needs_extra_unit_ascii(b: &mut Bencher) {utf16_needs_extra_unit("ascii", b)} #[bench] fn utf16_needs_extra_unit_en(b: &mut Bencher) {utf16_needs_extra_unit("en", b)} #[bench] fn utf16_needs_extra_unit_es(b: &mut Bencher) {utf16_needs_extra_unit("es", b)} #[bench] fn utf16_needs_extra_unit_ru(b: &mut Bencher) {utf16_needs_extra_unit("ru", b)} #[bench] fn utf16_needs_extra_unit_zh(b: &mut Bencher) {utf16_needs_extra_unit("zh", b)}