handles_utils/converter.rs
1//! # Handle Converter
2//!
3//! `handle_converter` provides functions to detect confusable Unicode characters in a
4//! given input string and return its canonical form.
5
6use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
7extern crate alloc;
8use crate::types::HandleSuffix;
9use alloc::{
10 string::{String, ToString},
11 vec::Vec,
12};
13
14// Will load `CONFUSABLES` with all the confusables at build time.
15// See build.rs
16include!(concat!(env!("OUT_DIR"), "/confusables.rs"));
17
/// Delimiter that separates the base handle from its numeric suffix in a display name (e.g. `name.42`).
19pub const HANDLE_DELIMITER: char = '.';
20
21/// Creates a new `HandleConverter` instance with a built confusables map.
22/// Converts a given string to its canonical form by stripping Unicode whitespace,
23/// replacing confusable characters, and stripping diacritical marks.
24/// The resulting string is converted to lowercase ASCII characters.
25///
26/// # Arguments
27///
28/// * `input_str` - The input string to convert to canonical form.
29///
30/// # Returns
31///
32/// A new string in canonical form.
33///
34pub fn convert_to_canonical(input_str: &str) -> alloc::string::String {
35 let white_space_stripped = strip_unicode_whitespace(input_str);
36 let diacriticals_stripped = strip_diacriticals(&white_space_stripped);
37 let confusables_removed = replace_confusables(&diacriticals_stripped);
38 confusables_removed.to_lowercase()
39}
40
41/// Replaces any characters in the input string that are confusable with a different character.
42///
43/// # Arguments
44///
45/// * `input_str` - A reference to the input string to replace confusable characters.
46///
47/// # Returns
48///
49/// A new string where any confusable characters have been replaced with their corresponding non-confusable characters.
50pub fn replace_confusables(input_str: &str) -> alloc::string::String {
51 input_str
52 .chars()
53 .map(|character| CONFUSABLES.get(&character).map_or(character, |&value| value))
54 .collect::<alloc::string::String>()
55}
56
57/// This function removes diacritical marks from the input string and returns a new `String` without them.
58///
59/// # Arguments
60/// * `input_str` - A string slice that contains the input string from which the diacritical marks need to be removed.
61///
62/// # Notes
63/// This function uses the NFKD normalization form and filtering of combining marks to strip the diacritical marks from the input string.
64/// Combining marks are Unicode characters that are intended to modify the appearance of another character.
65///
66pub fn strip_diacriticals(input_str: &str) -> alloc::string::String {
67 input_str
68 .nfkd()
69 .filter(|character| !is_combining_mark(*character))
70 .collect::<alloc::string::String>()
71}
72
73/// Splits the given display name into its base handle and handle suffix.
74///
75/// # Arguments
76///
77/// * `display_name_str` - The display name to split.
78///
79/// # Returns
80///
81/// A tuple containing the base handle string and the handle suffix as a `HandleSuffix` enum.
82///
83pub fn split_display_name(display_name_str: &str) -> Option<(String, HandleSuffix)> {
84 let parts: Vec<&str> = display_name_str.split(HANDLE_DELIMITER).collect();
85 let base_handle_str = parts[0].to_string();
86 if parts.len() != 2 {
87 return None;
88 }
89
90 let suffix = parts[1];
91 let suffix_num = suffix.parse::<u16>().ok()?;
92
93 Some((base_handle_str, suffix_num))
94}
95
/// Removes every Unicode whitespace character, plus the zero-width joiner characters,
/// from the given string.
///
/// # Arguments
///
/// * `input_str` - The string slice to strip.
///
/// # Returns
///
/// A new string containing the remaining characters in their original order.
pub fn strip_unicode_whitespace(input_str: &str) -> String {
    // U+200C (zero-width non-joiner) and U+200D (zero-width joiner) are used by some
    // writing systems. `char::is_whitespace` does not report them, so they are matched
    // explicitly and removed as well.
    let is_removed = |c: char| c.is_whitespace() || matches!(c, '\u{200C}' | '\u{200D}');

    input_str.chars().filter(|&c| !is_removed(c)).collect()
}
115
/// Trims whitespace from the head and tail and collapses every interior run of
/// whitespace to a single ASCII space.
///
/// # Arguments
///
/// * `input_str` - The string slice to trim and collapse.
///
/// # Returns
///
/// A new string in which the non-whitespace segments of `input_str` are joined by
/// exactly one `' '`. Returns an empty string when the input is empty or contains only
/// whitespace.
pub fn trim_and_collapse_whitespace(input_str: &str) -> String {
    // Benchmarked as slightly faster than https://crates.io/crates/collapse
    // `split_whitespace` never yields empty substrings, so no further filtering is needed.
    input_str.split_whitespace().collect::<Vec<_>>().join(" ")
}
132}